diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9cfa85d47029b..86e31e702f9b0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -18770,8 +18770,11 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { : &AMDGPU::SReg_32RegClass; if (!TRI->isSGPRClass(RC) && !isDivergent) return TRI->getEquivalentSGPRClass(RC); - if (TRI->isSGPRClass(RC) && isDivergent) + if (TRI->isSGPRClass(RC) && isDivergent) { + if (Subtarget->hasGFX90AInsts()) + return TRI->getEquivalentAVClass(RC); return TRI->getEquivalentVGPRClass(RC); + } return RC; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 0a5b51a33f02d..a48c9a26420ed 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3638,6 +3638,14 @@ SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { return ARC; } +const TargetRegisterClass * +SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const { + unsigned Size = getRegSizeInBits(*SRC); + const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size); + assert(ARC && "Invalid register class size"); + return ARC; +} + const TargetRegisterClass * SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { unsigned Size = getRegSizeInBits(*VRC); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 365a76d4a5006..a6af25dfd7d6f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -289,6 +289,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const; + /// \returns An AGPR+VGPR super reg class with the same width as \p SRC + const TargetRegisterClass * + getEquivalentAVClass(const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const; diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index ae83766cd6a4a..196958b74442f 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -644,10 +644,10 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ; def a[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -659,7 +659,7 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr2_agpr3 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -672,8 +672,8 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword a2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword a3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -758,7 +758,7 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB12_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB12_4 @@ -768,8 +768,8 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB12_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -926,12 +926,12 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -939,23 +939,23 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB14_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB14_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB14_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -1016,12 +1016,12 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1029,23 +1029,23 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB15_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB15_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB15_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -1220,7 +1220,7 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB17_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB17_4 @@ -1230,8 +1230,8 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -1294,12 +1294,12 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1307,23 +1307,23 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB18_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB18_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -1384,12 +1384,10 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 @@ -1406,14 +1404,14 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB19_2 ; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1559,12 +1557,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1589,12 +1587,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB21_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1611,24 +1609,24 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1641,23 +1639,23 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB22_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1694,12 +1692,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1723,12 +1721,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB23_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1745,23 +1743,23 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1774,22 +1772,22 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB24_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1810,23 +1808,23 @@ define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1839,22 +1837,22 @@ define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB25_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1891,12 +1889,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1920,12 +1918,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB26_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1942,24 +1940,24 @@ define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1972,23 +1970,23 @@ define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB27_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2009,23 +2007,23 @@ define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_v_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2038,22 +2036,22 @@ define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_v_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB28_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2073,24 +2071,25 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX90A-NEXT: ;;#ASMSTART @@ -2128,30 +2127,31 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33 -; GFX90A-NEXT: flat_load_dword v1, v[4:5] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX90A-NEXT: flat_load_dword v1, v[2:3] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ; def a34 ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v0 +; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v1, v[4:5], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2190,53 +2190,58 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a32 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:28 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a33, v1 ; GFX950-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a34 +; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 @@ -2270,28 +2275,26 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a33 -; GFX950-NEXT: flat_load_dword v1, v[4:5] -; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v0 -; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX950-NEXT: flat_load_dword v1, v[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v3, v1 -; GFX950-NEXT: v_xor_b32_e32 v2, v3, v0 +; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap v1, v[4:5], v[2:3] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB29_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a32, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2330,24 +2333,25 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a32 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 @@ -2511,14 +2515,14 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB32_4: ; %Flow3 @@ -2572,14 +2576,14 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB32_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB32_4: ; %Flow3 @@ -2619,10 +2623,10 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB33_4 @@ -2632,40 +2636,40 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB33_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB33_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB33_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2678,9 +2682,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB33_4 @@ -2690,37 +2694,37 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-NEXT: .LBB33_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB33_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB33_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB33_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 -; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB33_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -2759,14 +2763,14 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB34_4: ; %Flow3 @@ -2818,14 +2822,14 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB34_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB34_4: ; %Flow3 @@ -2864,9 +2868,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB35_4 @@ -2876,40 +2880,40 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB35_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB35_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB35_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2920,9 +2924,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB35_4 @@ -2932,37 +2936,37 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: .LBB35_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB35_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB35_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB35_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 -; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB35_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -2981,9 +2985,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB36_4 @@ -2993,40 +2997,40 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: .LBB36_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB36_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB36_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB36_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB36_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3037,9 +3041,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB36_4 @@ -3049,37 +3053,37 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-NEXT: .LBB36_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB36_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB36_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB36_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 -; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB36_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -3118,14 +3122,14 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB37_4: ; %Flow3 @@ -3177,14 +3181,14 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB37_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB37_4: ; %Flow3 @@ -3224,10 +3228,10 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB38_4 @@ -3237,40 +3241,40 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-NEXT: .LBB38_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB38_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB38_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB38_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3283,9 +3287,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB38_4 @@ -3295,37 +3299,37 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-NEXT: .LBB38_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB38_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB38_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB38_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 -; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB38_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -3344,9 +3348,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB39_4 @@ -3356,40 +3360,40 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB39_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB39_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB39_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB39_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3400,9 +3404,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB39_4 @@ -3412,37 +3416,37 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-NEXT: .LBB39_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB39_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB39_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB39_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 -; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB39_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -5439,13 +5443,13 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -5468,12 +5472,12 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB69_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -5492,44 +5496,44 @@ define void @flat_atomic_nand_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2 -; GFX90A-NEXT: v_not_b32_e32 v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_nand_i32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB70_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6061,13 +6065,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6092,13 +6096,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB85_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6117,25 +6121,25 @@ define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2 -; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6145,26 +6149,26 @@ define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v3 -; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2 -; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB86_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 @@ -6192,12 +6196,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6220,12 +6224,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB87_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6241,20 +6245,20 @@ define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_usub_sat_i32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6267,20 +6271,20 @@ define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_sat_i32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB88_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6402,35 +6406,35 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_add_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB90_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB90_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB90_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB90_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6587,35 +6591,35 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_sub_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB92_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB92_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB92_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB92_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6885,14 +6889,14 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB95_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB95_4: ; %Flow3 @@ -6950,14 +6954,14 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB95_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB95_4: ; %Flow3 @@ -6994,58 +6998,58 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_nand_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB96_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[2:3] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB96_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_and_b32_e32 v4, v7, v1 -; GFX90A-NEXT: v_and_b32_e32 v8, v6, v0 -; GFX90A-NEXT: v_not_b32_e32 v5, v4 -; GFX90A-NEXT: v_not_b32_e32 v4, v8 -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4 +; GFX90A-NEXT: v_not_b32_e32 v1, v0 +; GFX90A-NEXT: v_not_b32_e32 v0, v8 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB96_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB96_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB96_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX90A-NEXT: v_not_b32_e32 v0, v0 -; GFX90A-NEXT: v_not_b32_e32 v1, v1 -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB96_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7055,54 +7059,54 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB96_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB96_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_and_b32_e32 v2, v9, v1 -; GFX950-NEXT: v_and_b32_e32 v3, v8, v0 -; GFX950-NEXT: v_not_b32_e32 v7, v2 -; GFX950-NEXT: v_not_b32_e32 v6, v3 -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX950-NEXT: v_and_b32_e32 v8, v2, v4 +; GFX950-NEXT: v_not_b32_e32 v1, v0 +; GFX950-NEXT: v_not_b32_e32 v0, v8 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB96_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB96_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB96_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX950-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX950-NEXT: v_not_b32_e32 v1, v1 -; GFX950-NEXT: v_not_b32_e32 v0, v0 -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_and_b32_e32 v2, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX950-NEXT: v_not_b32_e32 v3, v2 +; GFX950-NEXT: v_not_b32_e32 v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB96_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -8492,14 +8496,14 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB111_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB111_4: ; %Flow3 @@ -8561,14 +8565,14 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB111_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB111_4: ; %Flow3 @@ -8607,60 +8611,60 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB112_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[2:3] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB112_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v0 -; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v1, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB112_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB112_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB112_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v4, v0 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v1, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB112_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8670,60 +8674,60 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB112_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB112_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc -; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB112_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB112_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB112_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v1, vcc -; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB112_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -8764,14 +8768,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB113_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB113_4: ; %Flow3 @@ -8831,14 +8835,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB113_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB113_4: ; %Flow3 @@ -8877,58 +8881,58 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB114_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 -; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB114_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB114_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB114_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v4, v2 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v3, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8938,58 +8942,58 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB114_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB114_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB114_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB114_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB114_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB114_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -10009,10 +10013,10 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -10020,39 +10024,39 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB128_6 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v5 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB128_3 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB128_3: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB128_5 ; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB128_5: ; %Flow1 ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB128_6: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB128_8 ; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc -; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: .LBB128_8: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] @@ -10249,23 +10253,23 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB130_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -10274,19 +10278,19 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execnz .LBB130_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB130_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB130_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB130_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART @@ -10406,31 +10410,31 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB132_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB132_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB132_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] @@ -10595,31 +10599,31 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB134_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB134_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB134_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] @@ -10729,16 +10733,15 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -10997,16 +11000,15 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART @@ -11237,12 +11239,12 @@ define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB139_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11274,20 +11276,20 @@ define void @flat_atomic_fadd_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fadd_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB140_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB140_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11334,12 +11336,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11362,12 +11364,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB141_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11383,20 +11385,20 @@ define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fsub_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB142_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB142_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11409,20 +11411,20 @@ define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fsub_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB142_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11457,13 +11459,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11488,13 +11490,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB143_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11510,22 +11512,22 @@ define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB144_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB144_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11538,23 +11540,23 @@ define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmax_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_max_f16 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB144_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11589,13 +11591,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11620,13 +11622,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB145_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11642,22 +11644,22 @@ define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB146_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11670,23 +11672,23 @@ define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmin_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_min_f16 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB146_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11726,13 +11728,13 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11755,12 +11757,12 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB147_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11776,53 +11778,53 @@ define void @flat_atomic_fmaximum_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB148_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB148_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11862,13 +11864,13 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11891,12 +11893,12 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB149_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11912,53 +11914,53 @@ define void @flat_atomic_fminimum_v2f16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fminimum_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB150_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB150_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fminimum_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB150_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12013,13 +12015,13 @@ define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12051,44 +12053,44 @@ define void @flat_atomic_fadd_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fadd_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB152_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB152_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -12146,13 +12148,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12180,13 +12182,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB153_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12202,76 +12204,76 @@ define void @flat_atomic_fsub_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fsub_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB154_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB154_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fsub_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB154_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -12316,13 +12318,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12350,13 +12352,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB155_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12372,76 +12374,76 @@ define void @flat_atomic_fmax_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB156_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB156_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmax_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB156_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -12486,13 +12488,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12520,13 +12522,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB157_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12542,76 +12544,76 @@ define void @flat_atomic_fmin_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB158_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB158_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmin_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB158_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -12661,13 +12663,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12695,13 +12697,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB159_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12717,81 +12719,81 @@ define void @flat_atomic_fmaximum_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB160_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB160_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2 -; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB160_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -12841,13 +12843,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB161_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12875,13 +12877,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB161_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12897,81 +12899,81 @@ define void @flat_atomic_fminimum_v2bf16_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB162_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2 -; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB162_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2 -; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB162_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -13306,27 +13308,28 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB171_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4 -; GFX90A-NEXT: v_not_b32_e32 v2, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_and_b32_e32 v0, v1, v4 +; GFX90A-NEXT: v_not_b32_e32 v0, v0 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13336,26 +13339,27 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB171_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_bitop3_b32 v0, v1, v4, v1 bitop3:0x3f +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB171_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13375,24 +13379,24 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2 -; GFX90A-NEXT: v_not_b32_e32 v4, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB172_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -13400,20 +13404,20 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB172_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14066,28 +14070,29 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB189_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14097,29 +14102,30 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB189_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4 -; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX950-NEXT: v_sub_u32_e32 v0, v1, v4 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB189_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14139,25 +14145,25 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2 -; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB190_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14168,26 +14174,26 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v3 -; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2 -; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB190_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 @@ -14202,26 +14208,27 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB191_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e64 v0, v1, v4 clamp +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB191_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14231,26 +14238,27 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB191_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_sub_u32_e64 v0, v1, v4 clamp +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB191_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14267,20 +14275,20 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB192_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14294,20 +14302,20 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB192_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14335,38 +14343,38 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB193_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: s_cbranch_execz .LBB193_3 ; GFX90A-NEXT: s_branch .LBB193_4 ; GFX90A-NEXT: .LBB193_2: -; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: ; implicit-def: $agpr2_agpr3 ; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword a0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword a1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: buffer_load_dword a2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a3, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ; use a[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -15062,14 +15070,14 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB201_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB201_6 ; GFX90A-NEXT: .LBB201_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -15126,14 +15134,14 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB201_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB201_6 ; GFX950-NEXT: .LBB201_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -15174,50 +15182,50 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB202_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB202_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_and_b32_e32 v2, v9, v1 -; GFX90A-NEXT: v_and_b32_e32 v3, v8, v0 -; GFX90A-NEXT: v_not_b32_e32 v7, v2 -; GFX90A-NEXT: v_not_b32_e32 v6, v3 -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4 +; GFX90A-NEXT: v_not_b32_e32 v1, v0 +; GFX90A-NEXT: v_not_b32_e32 v0, v8 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB202_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB202_6 ; GFX90A-NEXT: .LBB202_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB202_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX90A-NEXT: v_not_b32_e32 v0, v0 -; GFX90A-NEXT: v_not_b32_e32 v1, v1 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v3, v3 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB202_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -15232,46 +15240,46 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB202_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB202_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_and_b32_e32 v2, v9, v1 -; GFX950-NEXT: v_and_b32_e32 v3, v8, v0 -; GFX950-NEXT: v_not_b32_e32 v7, v2 -; GFX950-NEXT: v_not_b32_e32 v6, v3 -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX950-NEXT: v_and_b32_e32 v8, v2, v4 +; GFX950-NEXT: v_not_b32_e32 v1, v0 +; GFX950-NEXT: v_not_b32_e32 v0, v8 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB202_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB202_6 ; GFX950-NEXT: .LBB202_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB202_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX950-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX950-NEXT: v_not_b32_e32 v1, v1 -; GFX950-NEXT: v_not_b32_e32 v0, v0 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_and_b32_e32 v2, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX950-NEXT: v_not_b32_e32 v3, v2 +; GFX950-NEXT: v_not_b32_e32 v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB202_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -16798,14 +16806,14 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB219_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB219_6 ; GFX90A-NEXT: .LBB219_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -16866,14 +16874,14 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB219_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB219_6 ; GFX950-NEXT: .LBB219_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -16916,52 +16924,52 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB220_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB220_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB220_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB220_6 ; GFX90A-NEXT: .LBB220_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB220_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v0 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v1, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB220_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -16976,52 +16984,52 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB220_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB220_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc -; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB220_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB220_6 ; GFX950-NEXT: .LBB220_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB220_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB220_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -17062,14 +17070,14 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB221_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB221_6 ; GFX90A-NEXT: .LBB221_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -17128,14 +17136,14 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB221_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB221_6 ; GFX950-NEXT: .LBB221_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -17178,50 +17186,50 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB222_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB222_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB222_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB222_6 ; GFX90A-NEXT: .LBB222_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB222_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB222_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -17236,50 +17244,50 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB222_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB222_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB222_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB222_6 ; GFX950-NEXT: .LBB222_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB222_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB222_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -19015,16 +19023,15 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] @@ -19275,16 +19282,15 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] @@ -19494,26 +19500,27 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB247_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB247_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19548,20 +19555,20 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB248_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB248_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19597,26 +19604,27 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB249_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19626,26 +19634,27 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB249_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB249_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -19662,20 +19671,20 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB250_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19689,20 +19698,20 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB250_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19723,29 +19732,29 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19755,30 +19764,30 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB251_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -19795,29 +19804,29 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB252_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB252_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -19825,30 +19834,30 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_pk_max_f16 v3, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_max_f16 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB252_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 @@ -19863,29 +19872,29 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -19895,30 +19904,30 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB253_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -19935,29 +19944,29 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB254_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB254_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -19965,30 +19974,30 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_pk_max_f16 v3, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_min_f16 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB254_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 @@ -20024,13 +20033,13 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20040,26 +20049,27 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB255_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB255_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20076,33 +20086,34 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB256_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB256_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -20110,20 +20121,20 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB256_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20165,13 +20176,13 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20181,26 +20192,27 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB257_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB257_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20217,33 +20229,34 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB258_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_pk_min_f16 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB258_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -20251,20 +20264,20 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB258_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20321,13 +20334,13 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20362,45 +20375,45 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB260_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB260_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -20462,13 +20475,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20498,13 +20511,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB261_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20521,45 +20534,45 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB262_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB262_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -20567,33 +20580,33 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB262_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -20640,13 +20653,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20676,13 +20689,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB263_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20699,45 +20712,45 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB264_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB264_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -20745,33 +20758,33 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB264_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -20818,13 +20831,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -20854,13 +20867,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB265_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -20877,45 +20890,45 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB266_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB266_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -20923,33 +20936,33 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB266_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -21001,13 +21014,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -21037,13 +21050,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB267_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -21060,50 +21073,50 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB268_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB268_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -21111,33 +21124,33 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2 -; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB268_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 @@ -21189,13 +21202,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -21225,13 +21238,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB269_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -21248,50 +21261,50 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB270_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2 -; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB270_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -21299,33 +21312,33 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2 -; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_minimum3_f32 v0, v0, v4, v4 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB270_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index c3531f16248e9..b6fe0c756a106 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -1062,12 +1062,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1092,12 +1092,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB21_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1114,24 +1114,24 @@ define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_a_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1144,23 +1144,23 @@ define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_a_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: global_load_dword v3, v[0:1], off ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB22_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1197,12 +1197,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1226,12 +1226,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB23_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1248,23 +1248,23 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1277,22 +1277,22 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: global_load_dword v3, v[0:1], off ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB24_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1313,23 +1313,23 @@ define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1342,22 +1342,22 @@ define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: global_load_dword v3, v[0:1], off ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB25_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1394,12 +1394,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -1423,12 +1423,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB26_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -1445,24 +1445,24 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_a_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1475,23 +1475,23 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_a_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: global_load_dword v3, v[0:1], off ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB27_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1512,23 +1512,23 @@ define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_v_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1541,22 +1541,22 @@ define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_v_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: global_load_dword v3, v[0:1], off ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB28_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1576,24 +1576,25 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX90A-NEXT: ;;#ASMSTART @@ -1631,30 +1632,31 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33 -; GFX90A-NEXT: global_load_dword v1, v[4:5], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX90A-NEXT: global_load_dword v1, v[2:3], off ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ; def a34 ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v0 +; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v[4:5], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1693,53 +1695,58 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a32 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:28 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a33, v1 ; GFX950-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a34 +; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 @@ -1773,28 +1780,26 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX950-NEXT: v_accvgpr_read_b32 v4, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a33 -; GFX950-NEXT: global_load_dword v1, v[4:5], off -; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v0 -; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX950-NEXT: global_load_dword v1, v[2:3], off +; GFX950-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v3, v1 -; GFX950-NEXT: v_xor_b32_e32 v2, v3, v0 +; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap v1, v[4:5], v[2:3], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB29_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a32, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1833,24 +1838,25 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a32 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -2007,14 +2013,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2040,14 +2046,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB32_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2068,28 +2074,28 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2101,26 +2107,26 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB33_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -2151,14 +2157,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2182,14 +2188,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB34_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2209,27 +2215,27 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2239,26 +2245,26 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB35_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -2276,27 +2282,27 @@ define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2306,26 +2312,26 @@ define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB36_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -2356,14 +2362,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -2387,14 +2393,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB37_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -2415,28 +2421,28 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2448,26 +2454,26 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX950-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB38_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -2485,27 +2491,27 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2515,26 +2521,26 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB39_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -3893,13 +3899,13 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -3922,12 +3928,12 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB69_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -3946,44 +3952,44 @@ define void @global_atomic_nand_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2 -; GFX90A-NEXT: v_not_b32_e32 v4, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_nand_i32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB70_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4515,13 +4521,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -4546,13 +4552,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB85_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -4571,25 +4577,25 @@ define void @global_atomic_usub_cond_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2 -; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,26 +4605,26 @@ define void @global_atomic_usub_cond_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v3 -; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2 -; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB86_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -4646,12 +4652,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -4674,12 +4680,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB87_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -4695,20 +4701,20 @@ define void @global_atomic_usub_sat_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_usub_sat_i32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4721,20 +4727,20 @@ define void @global_atomic_usub_sat_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_usub_sat_i32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB88_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5000,14 +5006,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB95_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5033,14 +5039,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB95_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5059,26 +5065,26 @@ define void @global_atomic_nand_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB96_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_and_b32_e32 v4, v7, v3 -; GFX90A-NEXT: v_and_b32_e32 v8, v6, v2 -; GFX90A-NEXT: v_not_b32_e32 v5, v4 -; GFX90A-NEXT: v_not_b32_e32 v4, v8 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: v_and_b32_e32 v2, v5, v7 +; GFX90A-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX90A-NEXT: v_not_b32_e32 v3, v2 +; GFX90A-NEXT: v_not_b32_e32 v2, v8 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB96_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5088,26 +5094,26 @@ define void @global_atomic_nand_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB96_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_and_b32_e32 v4, v7, v3 -; GFX950-NEXT: v_and_b32_e32 v8, v6, v2 -; GFX950-NEXT: v_not_b32_e32 v5, v4 -; GFX950-NEXT: v_not_b32_e32 v4, v8 -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, v5, v7 +; GFX950-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX950-NEXT: v_not_b32_e32 v3, v2 +; GFX950-NEXT: v_not_b32_e32 v2, v8 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB96_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -5664,14 +5670,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB111_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5700,14 +5706,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB111_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5726,27 +5732,27 @@ define void @global_atomic_usub_cond_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 -; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB112_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5756,29 +5762,29 @@ define void @global_atomic_usub_cond_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3] +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB112_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -5809,14 +5815,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB113_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -5844,14 +5850,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB113_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -5870,26 +5876,26 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 -; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB114_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5899,28 +5905,28 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB114_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -7379,12 +7385,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7407,12 +7413,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB141_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7428,20 +7434,20 @@ define void @global_atomic_fsub_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fsub_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB142_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB142_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7454,20 +7460,20 @@ define void @global_atomic_fsub_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fsub_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB142_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7502,13 +7508,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7533,13 +7539,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB143_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7555,22 +7561,22 @@ define void @global_atomic_fmax_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmax_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB144_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB144_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7583,23 +7589,23 @@ define void @global_atomic_fmax_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fmax_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_max_f16 v4, v2, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB144_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7634,13 +7640,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7665,13 +7671,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB145_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7687,22 +7693,22 @@ define void @global_atomic_fmin_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmin_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB146_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7715,23 +7721,23 @@ define void @global_atomic_fmin_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fmin_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_min_f16 v4, v2, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB146_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7771,13 +7777,13 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7800,12 +7806,12 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB147_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7821,53 +7827,53 @@ define void @global_atomic_fmaximum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmaximum_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB148_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB148_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7907,13 +7913,13 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -7936,12 +7942,12 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB149_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -7957,53 +7963,53 @@ define void @global_atomic_fminimum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fminimum_v2f16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB150_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB150_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_v2f16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB150_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8058,13 +8064,13 @@ define void @global_atomic_fadd_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8096,44 +8102,44 @@ define void @global_atomic_fadd_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB152_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB152_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -8191,13 +8197,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8225,13 +8231,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB153_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8247,76 +8253,76 @@ define void @global_atomic_fsub_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fsub_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB154_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB154_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB154_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -8361,13 +8367,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8395,13 +8401,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB155_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8417,76 +8423,76 @@ define void @global_atomic_fmax_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmax_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB156_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB156_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmax_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB156_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -8531,13 +8537,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8565,13 +8571,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB157_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8587,76 +8593,76 @@ define void @global_atomic_fmin_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmin_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB158_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB158_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmin_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB158_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -8706,13 +8712,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8740,13 +8746,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB159_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8762,81 +8768,81 @@ define void @global_atomic_fmaximum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmaximum_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB160_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off offset:40 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB160_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2 -; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB160_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -8886,13 +8892,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB161_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -8920,13 +8926,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB161_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -8942,81 +8948,81 @@ define void @global_atomic_fminimum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fminimum_v2bf16_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB162_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2 -; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 -; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off offset:40 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB162_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_v2bf16_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v4 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2 -; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 -; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB162_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ; use v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -9349,13 +9355,13 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9379,12 +9385,12 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB171_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9400,55 +9406,55 @@ define void @global_atomic_nand_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_nand_i32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v2, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_and_b32_e32 v2, v3, v1 -; GFX90A-NEXT: v_not_b32_e32 v2, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[16:17] offset:40 glc +; GFX90A-NEXT: v_and_b32_e32 v0, v1, v3 +; GFX90A-NEXT: v_not_b32_e32 v0, v0 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB172_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_nand_i32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_bitop3_b32 v4, v5, v2, v5 bitop3:0x3f -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_bitop3_b32 v0, v1, v3, v1 bitop3:0x3f +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB172_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -10078,13 +10084,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -10110,13 +10116,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB189_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -10132,59 +10138,59 @@ define void @global_atomic_usub_cond_i32_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX90A-LABEL: global_atomic_usub_cond_i32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v2, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v1 -; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX90A-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[16:17] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB190_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_usub_cond_i32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v2, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v3, v2 -; GFX950-NEXT: v_sub_u32_e32 v2, v3, v1 -; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 +; GFX950-NEXT: v_sub_u32_e32 v0, v1, v3 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX950-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB190_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -10213,12 +10219,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB191_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -10242,12 +10248,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB191_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -10263,54 +10269,54 @@ define void @global_atomic_usub_sat_i32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_usub_sat_i32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v2 clamp -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_sub_u32_e64 v0, v1, v3 clamp +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB192_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_usub_sat_i32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_sub_u32_e64 v4, v5, v2 clamp -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_sub_u32_e64 v0, v1, v3 clamp +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB192_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -10658,14 +10664,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB201_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -10692,14 +10698,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB201_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -10715,60 +10721,60 @@ define void @global_atomic_nand_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_nand_i64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB202_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_and_b32_e32 v2, v9, v1 -; GFX90A-NEXT: v_and_b32_e32 v3, v8, v0 -; GFX90A-NEXT: v_not_b32_e32 v7, v2 -; GFX90A-NEXT: v_not_b32_e32 v6, v3 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, v2, v4 +; GFX90A-NEXT: v_not_b32_e32 v1, v0 +; GFX90A-NEXT: v_not_b32_e32 v0, v7 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB202_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_nand_i64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB202_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_and_b32_e32 v2, v9, v1 -; GFX950-NEXT: v_and_b32_e32 v3, v8, v0 -; GFX950-NEXT: v_not_b32_e32 v7, v2 -; GFX950-NEXT: v_not_b32_e32 v6, v3 -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX950-NEXT: v_and_b32_e32 v7, v2, v4 +; GFX950-NEXT: v_not_b32_e32 v1, v0 +; GFX950-NEXT: v_not_b32_e32 v0, v7 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB202_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -11425,14 +11431,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB219_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -11462,14 +11468,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB219_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11485,64 +11491,64 @@ define void @global_atomic_usub_cond_i64_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX90A-LABEL: global_atomic_usub_cond_i64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB220_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc -; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB220_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_usub_cond_i64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB220_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc -; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB220_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -11574,14 +11580,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB221_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -11610,14 +11616,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB221_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -11633,62 +11639,62 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_usub_sat_i64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB222_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB222_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_usub_sat_i64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB222_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB222_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -13197,12 +13203,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13226,12 +13232,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB249_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13247,54 +13253,54 @@ define void @global_atomic_fsub_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX90A-LABEL: global_atomic_fsub_v2f16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB250_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_v2f16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_pk_add_f16 v0, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB250_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -13324,13 +13330,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13356,13 +13362,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB251_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13378,59 +13384,59 @@ define void @global_atomic_fmax_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX90A-LABEL: global_atomic_fmax_v2f16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0 ; GFX90A-NEXT: .LBB252_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_pk_max_f16 v1, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v1, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB252_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmax_v2f16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v0 ; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_max_f16 v4, v1, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB252_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -13460,13 +13466,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13492,13 +13498,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB253_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13514,59 +13520,59 @@ define void @global_atomic_fmin_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt ; GFX90A-LABEL: global_atomic_fmin_v2f16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0 ; GFX90A-NEXT: .LBB254_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_pk_max_f16 v1, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v1, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB254_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmin_v2f16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v0 ; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_min_f16 v4, v1, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB254_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -13601,13 +13607,13 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13631,12 +13637,12 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB255_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13652,61 +13658,61 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX90A-LABEL: global_atomic_fmaximum_v2f16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB256_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v1 -; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_sdwa v3, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_perm_b32 v4, v3, v4, s8 -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v4, v0, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s8 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB256_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_v2f16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v2, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB256_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -13741,13 +13747,13 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13771,12 +13777,12 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg % ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB257_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -13792,61 +13798,61 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg ; GFX90A-LABEL: global_atomic_fminimum_v2f16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB258_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_min_f16 v3, v5, v1 -; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_sdwa v3, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_perm_b32 v4, v3, v4, s8 -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_pk_min_f16 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v4, v0, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s8 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB258_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_v2f16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v2, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB258_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -13896,13 +13902,13 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -13935,45 +13941,45 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX90A-LABEL: global_atomic_fadd_v2bf16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB260_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v1 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB260_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14033,13 +14039,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14068,13 +14074,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB261_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14090,78 +14096,78 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX90A-LABEL: global_atomic_fsub_v2bf16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB262_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v1 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB262_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_v2bf16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v3 -; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX950-NEXT: v_sub_f32_e32 v3, v3, v1 -; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 -; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX950-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB262_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -14207,13 +14213,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14242,13 +14248,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB263_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14264,78 +14270,78 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX90A-LABEL: global_atomic_fmax_v2bf16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB264_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v1 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB264_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmax_v2bf16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v3 -; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v1 -; GFX950-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 -; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB264_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -14381,13 +14387,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14416,13 +14422,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB265_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14438,78 +14444,78 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p ; GFX90A-LABEL: global_atomic_fmin_v2bf16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB266_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v1 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB266_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmin_v2bf16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v3 -; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX950-NEXT: v_min_f32_e32 v3, v3, v1 -; GFX950-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 -; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB266_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -14560,13 +14566,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14595,13 +14601,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB267_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14617,83 +14623,83 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre ; GFX90A-LABEL: global_atomic_fmaximum_v2bf16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v4, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB268_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_max_f32_e32 v7, v4, v1 -; GFX90A-NEXT: v_max_f32_e32 v8, v6, v3 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v3 -; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v7, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v8, v6, v5 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v5 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v4, v8, vcc +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB268_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_v2bf16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v3 -; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX950-NEXT: v_maximum3_f32 v3, v3, v1, v1 -; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 -; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_maximum3_f32 v0, v0, v3, v3 +; GFX950-NEXT: v_maximum3_f32 v5, v5, v4, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB268_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -14744,13 +14750,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -14779,13 +14785,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB269_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -14801,83 +14807,83 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre ; GFX90A-LABEL: global_atomic_fminimum_v2bf16_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v4, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB270_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX90A-NEXT: v_min_f32_e32 v7, v4, v1 -; GFX90A-NEXT: v_min_f32_e32 v8, v6, v3 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v3 -; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v1 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v7, v0, v3 +; GFX90A-NEXT: v_min_f32_e32 v8, v6, v5 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v5 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v4, v8, vcc +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 ; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 -; GFX90A-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB270_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_v2bf16_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v3 -; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX950-NEXT: v_minimum3_f32 v3, v3, v1, v1 -; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 -; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_minimum3_f32 v0, v0, v3, v3 +; GFX950-NEXT: v_minimum3_f32 v5, v5, v4, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB270_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index d89b39348ad9a..5c526c78afcd7 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -43,31 +43,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 + ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF implicit-def $vgpr16 ; GFX90A-NEXT: renamable $vgpr3 = IMPLICIT_DEF implicit-def $vgpr2 - ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24 ; GFX90A-NEXT: renamable $vgpr27 = IMPLICIT_DEF implicit-def $vgpr26 ; GFX90A-NEXT: renamable $vgpr29 = IMPLICIT_DEF implicit-def $vgpr28 + ; GFX90A-NEXT: renamable $vgpr33 = IMPLICIT_DEF implicit-def $vgpr32 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec + ; GFX90A-NEXT: renamable $vgpr6 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr1, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -75,7 +75,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 @@ -88,9 +88,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF @@ -98,32 +98,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr26 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr28 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr29 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr18, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr33 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr18, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr29 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec @@ -131,7 +131,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec @@ -140,15 +140,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec @@ -157,15 +157,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec @@ -174,10 +174,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: @@ -359,7 +359,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i23) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec @@ -376,37 +376,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i30) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec @@ -416,29 +416,29 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -459,7 +459,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i37) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec @@ -469,28 +469,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -510,40 +510,39 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc - ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i44) + ; GFX90A-NEXT: renamable $vgpr1, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $vgpr58, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i44) ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr59, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr3, implicit $exec ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc @@ -560,11 +559,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr66_sgpr67, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc @@ -572,27 +571,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr62, $vgpr56, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr57, $vgpr61, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr4, $vgpr5, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr63, $vgpr58 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr62, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr40, $vgpr61, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr4, $vgpr5, $vgpr6, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr58, $vgpr60, $vgpr63, $vgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc @@ -608,7 +607,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -620,26 +619,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr44_sgpr45 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr44_sgpr45, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -657,139 +656,139 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr8 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76) - ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76) + ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr8, implicit $exec + ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr62_sgpr63 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr62_sgpr63, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr12 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr66_sgpr67, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr14, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr13, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr13, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr7, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr3, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr3 = COPY renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr3, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_LSHR_B64 killed renamable $sgpr56_sgpr57, 1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32_vgpr33 = V_LSHRREV_B64_e64 1, $vgpr20_vgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = V_LSHRREV_B64_e64 1, $vgpr24_vgpr25, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr22, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr30 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr18 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 @@ -800,9 +799,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF @@ -812,7 +811,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = COPY renamable $vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7 = COPY renamable $vgpr5, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} @@ -821,62 +820,62 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr28_vgpr29 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr28_vgpr29 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr32_vgpr33 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) + ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 1, $vgpr10, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $vgpr11, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr12_vgpr13, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr12, implicit $exec - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr52_sgpr53, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr24_sgpr25 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -884,122 +883,122 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr6, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.67, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr25, killed $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr15, killed $vgpr3, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr27, killed $vgpr29, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr17, killed $vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr12, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr30, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr16, killed $vgpr19, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr18, killed $vgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr4, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr7, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr32, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr14, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) ; GFX90A-NEXT: S_BRANCH %bb.65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $agpr0 = COPY killed renamable $vgpr32, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 1, $vgpr28, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_OR_B32_e32 $vgpr32, $vgpr26, implicit $exec - ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr52, $vgpr24, implicit $exec - ; GFX90A-NEXT: renamable $vgpr34 = V_CNDMASK_B32_e64 0, $vgpr38, 0, 0, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr34, $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr50, $vgpr14, implicit $exec - ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr48, $vgpr18, implicit $exec - ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr36, killed $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $agpr0 = COPY killed renamable $vgpr14, implicit $exec + ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 1, $vgpr32, implicit $exec + ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 $vgpr34, $vgpr28, implicit $exec + ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr54, $vgpr26, implicit $exec + ; GFX90A-NEXT: renamable $vgpr36 = V_CNDMASK_B32_e64 0, $vgpr48, 0, 0, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_OR_B32_e32 $vgpr36, $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr52, $vgpr16, implicit $exec + ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr50, $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr38, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = V_LSHLREV_B64_e64 3, killed $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr4, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr4, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr5, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr33 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr55 = COPY renamable $vgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr33, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr35 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr55 = COPY renamable $vgpr35, implicit $exec + ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr35, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr35, implicit $exec + ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr35, implicit $exec + ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr35, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = COPY renamable $vgpr35, implicit $exec + ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr35, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr35, renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr13 = COPY killed renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr13, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr33, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr16 = COPY killed renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr16, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr35, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr35, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr14_vgpr15, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr35, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr24_sgpr25 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr32 = COPY killed renamable $agpr0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = COPY killed renamable $agpr0, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr54, killed $vgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20 = V_OR_B32_e32 killed $vgpr2, killed $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr21, renamable $vgpr20_vgpr21, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr14, killed $vgpr24, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 = V_OR_B32_e32 killed $vgpr2, killed $vgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr23, renamable $vgpr22_vgpr23, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.69 bb: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll index 2abd7edade8a1..99f1cce0c9055 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll @@ -16,7 +16,7 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -36,7 +36,7 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offen_rtn(<2 x half> %val, ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -56,7 +56,7 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val, ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -78,7 +78,7 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -103,7 +103,7 @@ define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offset_rtn(<2 x half> ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) @@ -129,7 +129,7 @@ define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offen_rtn(<2 x half> % ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -155,7 +155,7 @@ define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_idxen_rtn(<2 x half> % ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -183,7 +183,7 @@ define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_bothen_rtn(<2 x half> ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 04f8ad8a02303..931a62298812f 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -258,68 +258,59 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 -; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v0, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v0, s[4:7], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v0, s[4:7], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v0, s[4:7], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v0, s[4:7], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v0, s[4:7], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v0, s[4:7], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v0, s[4:7], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v0, s[4:7], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v0, s[4:7], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v0, s[4:7], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16 ; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 ; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224 -; SDAG-GFX942-NEXT: s_nop 0 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_nop 0 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16 -; SDAG-GFX942-NEXT: s_nop 1 -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm @@ -799,68 +790,59 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 -; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v0, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v0, s[4:7], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v0, s[4:7], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v0, s[4:7], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v0, s[4:7], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v0, s[4:7], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v0, s[4:7], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v0, s[4:7], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v0, s[4:7], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v0, s[4:7], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v0, s[4:7], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16 ; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 ; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x100 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224 -; SDAG-GFX942-NEXT: s_nop 0 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_nop 0 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16 -; SDAG-GFX942-NEXT: s_nop 1 -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm @@ -1158,8 +1140,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 ; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX942-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] -; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen ; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54 ; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; SDAG-GFX942-NEXT: s_mov_b32 s5, s12 @@ -1170,12 +1152,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 ; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; SDAG-GFX942-NEXT: v_mov_b32_e32 v5, s0 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v1, s0 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_endpgm ; ; SDAG-GFX1100-LABEL: memcpy_known_small: diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll index fccee3da6d77e..5f965ba431ab5 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -1553,13 +1553,13 @@ define void @too_many_args_use_workitem_id_xyz( ; GFX90A-LABEL: too_many_args_use_workitem_id_xyz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX90A-NEXT: v_and_b32_e32 v33, 0x3ff, v31 -; GFX90A-NEXT: global_store_dword v[0:1], v33, off +; GFX90A-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX90A-NEXT: v_and_b32_e32 v32, 0x3ff, v31 +; GFX90A-NEXT: global_store_dword v[0:1], v32, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_bfe_u32 v33, v31, 10, 10 +; GFX90A-NEXT: v_bfe_u32 v32, v31, 10, 10 ; GFX90A-NEXT: v_bfe_u32 v31, v31, 20, 10 -; GFX90A-NEXT: global_store_dword v[0:1], v33, off +; GFX90A-NEXT: global_store_dword v[0:1], v32, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dword v[0:1], v31, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1625,7 +1625,7 @@ define void @too_many_args_use_workitem_id_xyz( ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dword v[0:1], v30, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dword v[0:1], v32, off +; GFX90A-NEXT: global_store_dword v[0:1], v33, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll index a13f3513c660e..aede91b76f441 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll @@ -43,25 +43,26 @@ define void @phi_with_alloca_and_divergent_copy_to_reg(ptr addrspace(5) %diverge ; CHECK-LABEL: phi_with_alloca_and_divergent_copy_to_reg: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s6, s32, 6 ; CHECK-NEXT: v_mov_b32_e32 v7, v2 ; CHECK-NEXT: v_mov_b32_e32 v6, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_lshrrev_b32_e64 v2, 6, s32 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: v_lshl_add_u32 v2, v3, 2, v1 -; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; CHECK-NEXT: v_add_u32_e32 v2, 1, v3 -; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2 +; CHECK-NEXT: v_add_u32_e32 v8, 1, v3 +; CHECK-NEXT: v_lshl_add_u32 v5, v3, 2, v1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v8 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v3, v4 -; CHECK-NEXT: v_mov_b32_e32 v2, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %done ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dword v[6:7], v0, off ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 0e86a1ac68119..56de9dde7c310 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -120,19 +120,19 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_bfi_b32 v2, v3, -2, -1 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: v_bfi_b32 v0, v1, -2, -1 +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 5c4e25c3120e9..2079c864c29d3 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -8485,27 +8485,27 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB36_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8630,26 +8630,26 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8858,30 +8858,30 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: flat_load_dword v4, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB37_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9010,26 +9010,26 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9243,30 +9243,30 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: flat_load_dword v4, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB38_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9395,26 +9395,26 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10978,13 +10978,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -10993,6 +10992,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB43_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11087,13 +11087,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc @@ -11101,6 +11100,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11289,30 +11289,30 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: flat_load_dword v4, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB44_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11441,28 +11441,28 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12080,36 +12080,36 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -12262,33 +12262,33 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -12533,41 +12533,41 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12717,41 +12717,41 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13001,41 +13001,41 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -13185,41 +13185,41 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14346,34 +14346,34 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX942-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -14495,32 +14495,32 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -15546,41 +15546,41 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15730,43 +15730,43 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16429,18 +16429,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16618,18 +16618,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16819,18 +16819,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17584,12 +17584,11 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc @@ -17598,6 +17597,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB62_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17966,18 +17966,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB64_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18334,18 +18334,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB66_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18796,37 +18796,37 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB68_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -19120,37 +19120,37 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19453,36 +19453,36 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20759,39 +20759,39 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB74_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21409,37 +21409,37 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: @@ -22045,37 +22045,37 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 53e9468c5d5b6..d07b2b412acef 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6125,29 +6125,29 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -6279,28 +6279,28 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -6519,34 +6519,34 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6676,36 +6676,36 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6929,34 +6929,34 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7086,36 +7086,36 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8481,26 +8481,26 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8599,25 +8599,25 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9110,34 +9110,34 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9267,38 +9267,38 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9940,36 +9940,36 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB36_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -10122,33 +10122,33 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -10394,41 +10394,41 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB37_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10578,41 +10578,41 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10863,41 +10863,41 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB38_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11047,41 +11047,41 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12649,34 +12649,34 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB42_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12798,32 +12798,32 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -13414,41 +13414,41 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB44_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13598,43 +13598,43 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14261,25 +14261,25 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -14341,23 +14341,23 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -14497,25 +14497,25 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14578,23 +14578,23 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14734,21 +14734,18 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_pk_max_f16 v0, v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_max_f16 v2, v0, v1 @@ -14758,6 +14755,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14829,20 +14827,20 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15707,25 +15705,25 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15788,25 +15786,25 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16263,39 +16261,39 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16441,37 +16439,37 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16710,39 +16708,39 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16889,37 +16887,37 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17156,46 +17154,44 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX942-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v2, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v0, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17350,36 +17346,36 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18962,39 +18958,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19141,39 +19137,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 5ee3ff67aa8a0..4eb8fdb0b0999 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6125,29 +6125,29 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -6279,28 +6279,28 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -6519,34 +6519,34 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6676,36 +6676,36 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6929,34 +6929,34 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7086,36 +7086,36 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -8481,26 +8481,26 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -8599,25 +8599,25 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -9110,34 +9110,34 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9267,38 +9267,38 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9940,36 +9940,36 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB36_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -10122,33 +10122,33 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -10394,41 +10394,41 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB37_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10578,41 +10578,41 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -10863,41 +10863,41 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB38_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11047,41 +11047,41 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -12649,34 +12649,34 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB42_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12798,32 +12798,32 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -13414,41 +13414,41 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB44_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13598,43 +13598,43 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14261,25 +14261,25 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -14341,23 +14341,23 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -14497,25 +14497,25 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14578,23 +14578,23 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14734,21 +14734,18 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_pk_max_f16 v0, v3, v3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_pk_min_f16 v2, v0, v1 @@ -14758,6 +14755,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14829,20 +14827,20 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15707,25 +15705,25 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15788,25 +15786,25 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16263,39 +16261,39 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16441,37 +16439,37 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16710,39 +16708,39 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16889,37 +16887,37 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17156,46 +17154,44 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX942-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v2, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v0, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17350,36 +17346,36 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18962,39 +18958,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19141,39 +19137,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 0e563c26d27ea..7813cb4226dad 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -5918,27 +5918,27 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX942-NEXT: flat_load_dword v4, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6063,26 +6063,26 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6291,30 +6291,30 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: flat_load_dword v4, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6443,26 +6443,26 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6676,30 +6676,30 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: flat_load_dword v4, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6828,26 +6828,26 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8137,13 +8137,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -8152,6 +8151,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8246,13 +8246,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f16_e32 v3, v5, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc @@ -8260,6 +8259,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8722,30 +8722,30 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: flat_load_dword v4, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8874,28 +8874,28 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v4, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9513,36 +9513,36 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16: @@ -9695,33 +9695,33 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16: @@ -9966,41 +9966,41 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -10150,41 +10150,41 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -10434,41 +10434,41 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -10618,41 +10618,41 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -12216,34 +12216,34 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB38_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12365,32 +12365,32 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12979,41 +12979,41 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v5, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB40_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -13163,43 +13163,43 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX90A-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: flat_load_dword v5, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -13820,12 +13820,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 @@ -13833,6 +13832,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB42_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13893,18 +13893,18 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14039,12 +14039,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 @@ -14052,6 +14051,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB43_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14113,18 +14113,18 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14261,27 +14261,25 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX942-NEXT: v_pk_add_f16 v6, v7, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB44_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14349,18 +14347,18 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15158,12 +15156,11 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX942-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 @@ -15171,6 +15168,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15232,12 +15230,11 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX90A-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc @@ -15246,6 +15243,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15682,39 +15680,39 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16: @@ -15860,37 +15858,37 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16: @@ -16129,39 +16127,39 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -16308,37 +16306,37 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -16575,46 +16573,44 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; GFX942-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v2, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v0, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16769,36 +16765,36 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18381,39 +18377,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -18560,39 +18556,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 8a4b2c428e31a..1a1437a25a1fe 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -3626,7 +3626,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB36_2 ; GFX950-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private @@ -3794,7 +3794,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB37_2 ; GFX950-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private @@ -4512,7 +4512,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB44_2 ; GFX950-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private @@ -4680,7 +4680,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB45_2 ; GFX950-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private @@ -5398,7 +5398,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB52_2 ; GFX950-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private @@ -5566,7 +5566,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB53_2 ; GFX950-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private @@ -13018,21 +13018,20 @@ define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v1 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13184,21 +13183,20 @@ define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_min_f16 v4, v0, v1 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13396,24 +13394,23 @@ define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v7, v0 -; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX950-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0 -; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB130_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13513,24 +13510,23 @@ define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX950-NEXT: .LBB132_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v7, v0 -; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX950-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0 -; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB132_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 3d0ebc72791bd..39fa342d2a66f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -36,37 +36,36 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY6]], 0, 0, implicit $exec :: (load (s64) from %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:av_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY6]], 0, 0, implicit $exec :: (load (s64) from %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.atomicrmw.start: ; GFX90A-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY7]], %bb.0, %3, %bb.1 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.0, %3, %bb.1 ; GFX90A-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI1]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 - ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 - ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 - ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 - ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] - ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY12]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:av_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:av_32 = COPY [[PHI1]].sub1 + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:av_32 = COPY [[PHI1]].sub0 + ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY11]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U64_e64 [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], [[PHI1]], implicit $exec - ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] ; GFX90A-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U64_e64_]], [[PHI]], implicit-def dead $scc ; GFX90A-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.atomicrmw.end: - ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY13]], %bb.1 + ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY12]], %bb.1 ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec - ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 1978e68fdae9c..0a40e62b4ed3a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -8960,27 +8960,27 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX942-NEXT: global_load_dword v4, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB44_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9105,26 +9105,26 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9469,30 +9469,30 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_load_dword v4, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB45_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9621,26 +9621,26 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9993,30 +9993,30 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_load_dword v4, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10145,26 +10145,26 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11972,13 +11972,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_add_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -11987,6 +11986,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12080,13 +12080,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc @@ -12094,6 +12093,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12780,30 +12780,30 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_load_dword v4, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12932,28 +12932,28 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13852,36 +13852,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -14034,33 +14034,33 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -14450,41 +14450,41 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14634,41 +14634,41 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15068,41 +15068,41 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15252,41 +15252,41 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17409,34 +17409,34 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX942-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -17557,32 +17557,32 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -18417,41 +18417,41 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB62_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18601,43 +18601,43 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB62_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -21397,18 +21397,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22365,18 +22365,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22963,37 +22963,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -23351,37 +23351,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -23741,37 +23741,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -25278,39 +25278,39 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -26052,37 +26052,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: @@ -26816,37 +26816,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -27580,37 +27580,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index dc995fb7ef79c..dec3dd79f60d1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4548,29 +4548,29 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4702,28 +4702,28 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4992,34 +4992,34 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5149,36 +5149,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5454,34 +5454,34 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5611,36 +5611,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7207,26 +7207,26 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7324,25 +7324,25 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7912,34 +7912,34 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8069,38 +8069,38 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8844,36 +8844,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB36_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9026,33 +9026,33 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9349,41 +9349,41 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB37_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9533,41 +9533,41 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9871,41 +9871,41 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB38_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10055,41 +10055,41 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11862,34 +11862,34 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB42_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12010,32 +12010,32 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12705,41 +12705,41 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB44_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12889,43 +12889,43 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13656,25 +13656,25 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13736,23 +13736,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13949,25 +13949,25 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14029,23 +14029,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14244,25 +14244,25 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14324,23 +14324,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15404,25 +15404,25 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15484,25 +15484,25 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16067,39 +16067,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16245,37 +16245,37 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16567,39 +16567,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16745,37 +16745,37 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17069,39 +17069,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17247,37 +17247,37 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -19040,39 +19040,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19218,39 +19218,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index f62e13a9d4341..ce087737b852a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4548,29 +4548,29 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4702,28 +4702,28 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4992,34 +4992,34 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5149,36 +5149,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5454,34 +5454,34 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5611,36 +5611,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7207,26 +7207,26 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7324,25 +7324,25 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7912,34 +7912,34 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8069,38 +8069,38 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8844,36 +8844,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB36_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9026,33 +9026,33 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9349,41 +9349,41 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB37_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9533,41 +9533,41 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9871,41 +9871,41 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB38_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10055,41 +10055,41 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -11862,34 +11862,34 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB42_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12010,32 +12010,32 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12705,41 +12705,41 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB44_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -12889,43 +12889,43 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13656,25 +13656,25 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB46_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13736,23 +13736,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: @@ -13949,25 +13949,25 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB47_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14029,23 +14029,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14244,25 +14244,25 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -14324,23 +14324,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15404,25 +15404,25 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -15484,25 +15484,25 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16067,39 +16067,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16245,37 +16245,37 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: @@ -16567,39 +16567,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -16745,37 +16745,37 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -17069,39 +17069,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -17247,37 +17247,37 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -19040,39 +19040,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -19218,39 +19218,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 9e6f0fd7f13b5..f72296b68bea2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -5290,27 +5290,27 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX942-NEXT: global_load_dword v4, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB22_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5435,26 +5435,26 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5713,30 +5713,30 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_load_dword v4, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB23_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5865,26 +5865,26 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6150,30 +6150,30 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_load_dword v4, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6302,26 +6302,26 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7812,13 +7812,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -7827,6 +7826,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7920,13 +7920,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f16_e32 v3, v5, v2 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc @@ -7934,6 +7933,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8473,30 +8473,30 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX942-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_load_dword v4, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v4 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8625,28 +8625,28 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9366,36 +9366,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -9548,33 +9548,33 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -9869,41 +9869,41 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -10053,41 +10053,41 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -10389,41 +10389,41 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB34_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -10573,41 +10573,41 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -12372,34 +12372,34 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 ; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB38_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12520,32 +12520,32 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -13211,41 +13211,41 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v5, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX942-NEXT: v_not_b32_e32 v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v5, v5 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX942-NEXT: s_movk_i32 s2, 0x7fff ; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB40_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -13395,43 +13395,43 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -14154,12 +14154,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 @@ -14167,6 +14166,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB42_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14227,18 +14227,18 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-LABEL: global_agent_atomic_fsub_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14430,12 +14430,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 @@ -14443,6 +14442,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB43_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14503,18 +14503,18 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14708,12 +14708,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 @@ -14721,6 +14720,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB44_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14781,18 +14781,18 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15794,12 +15794,11 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX942-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 @@ -15807,6 +15806,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15867,12 +15867,11 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -15881,6 +15880,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16425,39 +16425,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -16603,37 +16603,37 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_v2bf16: @@ -16925,39 +16925,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17103,37 +17103,37 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -17427,39 +17427,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -17605,37 +17605,37 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: @@ -19398,39 +19398,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: @@ -19576,39 +19576,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll index 57bfd2490f9da..d973f7b71fb6d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll @@ -19,11 +19,11 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; GFX90A-LABEL: half8: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX90A-NEXT: global_load_dwordx4 v[2:5], v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX1030-LABEL: half8: @@ -85,11 +85,11 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; GFX90A-LABEL: half6: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] +; GFX90A-NEXT: global_load_dwordx3 v[2:4], v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] +; GFX90A-NEXT: global_store_dwordx3 v0, v[2:4], s[2:3] ; GFX90A-NEXT: s_endpgm ; ; GFX1030-LABEL: half6: diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 24512f2f7905a..e1a9935b790f3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -162,12 +162,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX90A-LABEL: atomic_nand_i32_global: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_bfi_b32 v2, v3, -5, -1 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc @@ -176,6 +175,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll index d23509b5aa812..f99718de97765 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -49,10 +49,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_barrier -; GFX90A-NEXT: ds_read_b32 v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ds_read_b32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: barrier_release: @@ -72,10 +72,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_barrier ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-LABEL: barrier_release: @@ -94,10 +94,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_barrier -; GFX942-NEXT: ds_read_b32 v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: ds_read_b32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: barrier_release: @@ -117,10 +117,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: s_barrier ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 -; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX10WGP-LABEL: barrier_release: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll index 3e96dfe40f745..a57b43a81205b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll @@ -37,11 +37,11 @@ entry: define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX950-SDAG-LABEL: ds_read_b96_tr_b6: ; GFX950-SDAG: ; %bb.0: ; %entry -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32 +; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[4:6], v0 offset:32 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off +; GFX950-SDAG-NEXT: global_store_dwordx3 v[2:3], v[4:6], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: ds_read_b96_tr_b6: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index e174fc17e98fe..1e6aea593065c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -159,100 +159,100 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v3, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v3 -; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48 +; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v3 +; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] -; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304 -; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288 -; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272 -; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256 -; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240 -; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224 -; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208 -; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:8192 ; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] -; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] -; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[0:3] +; GCN-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[128:131] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-NEXT: s_endpgm entry: call void @llvm.amdgcn.iglp.opt(i32 1) @@ -294,17 +294,17 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b32 v1, v1 +; GCN-NEXT: ds_read_b32 v2, v1 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_write_b32 v0, v1 +; GCN-NEXT: ds_write_b32 v0, v2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ds_read_b32 v0, v2 offset:256 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: ds_read_b32 v1, v1 offset:256 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_write_b32 v1, v0 offset:256 +; GCN-NEXT: ds_write_b32 v0, v1 offset:256 ; GCN-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index bc72687e260e7..ff3de9e05e897 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -26,7 +26,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0 ; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1 ; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 @@ -48,7 +47,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -71,7 +69,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -94,7 +91,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -117,7 +113,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -130,6 +125,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: s_endpgm ; ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: @@ -495,8 +495,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 @@ -520,7 +520,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 @@ -543,7 +542,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -566,7 +564,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0) ; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) -; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 @@ -579,6 +576,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:32784 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:32768 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) +; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: s_endpgm ; ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index b65a1a8e06c7d..19bbfe1cdd9bb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -636,48 +636,48 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32 ; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48 ; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3 -; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192 -; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688 -; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672 -; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656 -; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640 -; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624 -; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608 -; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592 -; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576 -; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264 -; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248 -; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232 -; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216 -; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200 -; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184 -; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168 -; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152 -; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:8304 +; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:8288 +; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:8272 +; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:8256 +; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:8240 +; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:8224 +; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:8208 +; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:8192 +; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:24608 +; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:24576 +; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:49264 +; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:49248 +; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:49232 +; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:49216 +; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:49200 +; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:49184 +; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:49168 +; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:49152 +; GCN-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(14) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(8) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; GCN-NEXT: s_nop 11 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 @@ -688,38 +688,38 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[128:131] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0) ; GCN-NEXT: s_endpgm @@ -742,48 +742,48 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32 ; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576 -; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152 -; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:8192 +; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:24576 +; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v3 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v3 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v3 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v3 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v3 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v3 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v3 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v3 offset:49152 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v4 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v4 offset:57392 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159] ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127] -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95] +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63] +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31] ; EXACTCUTOFF-NEXT: s_nop 11 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 @@ -794,38 +794,38 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16400 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -1202,57 +1202,57 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_add_u32_e32 v1, s6, v0 -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[96:99], v1 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:48 -; GCN-NEXT: v_mov_b32_e32 v9, 1.0 +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v1 +; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 +; GCN-NEXT: v_mul_f32_e32 v9, s1, v3 +; GCN-NEXT: v_mov_b32_e32 v12, 1.0 ; GCN-NEXT: v_ldexp_f32 v4, v4, v5 ; GCN-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; GCN-NEXT: v_mul_f32_e32 v10, s1, v3 +; GCN-NEXT: v_rndne_f32_e32 v10, v9 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; GCN-NEXT: v_rndne_f32_e32 v11, v10 +; GCN-NEXT: v_sub_f32_e32 v11, v9, v10 +; GCN-NEXT: v_fma_f32 v9, s1, v3, -v9 ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 ; GCN-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fma_f32 v10, s1, v3, -v10 +; GCN-NEXT: v_fmac_f32_e32 v9, s1, v7 ; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_fmac_f32_e32 v10, s1, v7 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; GCN-NEXT: v_add_f32_e32 v9, v11, v9 +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 ; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127] -; GCN-NEXT: v_add_f32_e32 v4, v12, v10 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; GCN-NEXT: v_ldexp_f32 v4, v4, v10 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v12, v4, a[128:159] +; GCN-NEXT: v_exp_f32_e32 v4, v9 +; GCN-NEXT: v_cvt_i32_f32_e32 v9, v10 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; GCN-NEXT: v_ldexp_f32 v4, v4, v9 +; GCN-NEXT: v_mul_f32_e32 v9, s2, v3 +; GCN-NEXT: v_rndne_f32_e32 v10, v9 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 +; GCN-NEXT: v_sub_f32_e32 v11, v9, v10 +; GCN-NEXT: v_fma_f32 v9, s2, v3, -v9 ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 +; GCN-NEXT: v_fmac_f32_e32 v9, s2, v7 ; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_mul_f32_e32 v10, s2, v3 -; GCN-NEXT: v_rndne_f32_e32 v11, v10 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] -; GCN-NEXT: v_fma_f32 v4, s2, v3, -v10 -; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fmac_f32_e32 v4, s2, v7 -; GCN-NEXT: v_add_f32_e32 v4, v12, v4 -; GCN-NEXT: v_exp_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 +; GCN-NEXT: v_add_f32_e32 v9, v11, v9 ; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v12, v4, a[96:127] +; GCN-NEXT: v_exp_f32_e32 v4, v9 +; GCN-NEXT: v_cvt_i32_f32_e32 v9, v10 ; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 ; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 ; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 @@ -1269,60 +1269,60 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 ; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 ; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; GCN-NEXT: v_ldexp_f32 v1, v4, v10 +; GCN-NEXT: v_ldexp_f32 v1, v4, v9 +; GCN-NEXT: v_mul_f32_e32 v4, s3, v3 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; GCN-NEXT: v_rndne_f32_e32 v9, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; GCN-NEXT: v_mul_f32_e32 v4, s3, v3 +; GCN-NEXT: v_sub_f32_e32 v10, v4, v9 +; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_rndne_f32_e32 v10, v4 +; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7 ; GCN-NEXT: s_load_dword s8, s[4:5], 0x54 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] -; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 -; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7 -; GCN-NEXT: v_add_f32_e32 v1, v1, v4 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v12, v1, a[64:95] +; GCN-NEXT: v_add_f32_e32 v1, v10, v4 ; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v10 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v9 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440 +; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 ; GCN-NEXT: v_ldexp_f32 v1, v1, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_mul_f32_e32 v4, s8, v3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GCN-NEXT: v_rndne_f32_e32 v9, v4 ; GCN-NEXT: v_fma_f32 v3, s8, v3, -v4 -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63] -; GCN-NEXT: v_rndne_f32_e32 v1, v4 -; GCN-NEXT: v_sub_f32_e32 v10, v4, v1 ; GCN-NEXT: v_fmac_f32_e32 v3, s8, v7 -; GCN-NEXT: v_add_f32_e32 v3, v10, v3 -; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 -; GCN-NEXT: v_ldexp_f32 v1, v3, v1 +; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v12, v1, a[32:63] +; GCN-NEXT: v_sub_f32_e32 v1, v4, v9 +; GCN-NEXT: v_add_f32_e32 v1, v1, v3 +; GCN-NEXT: v_exp_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v9 +; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392 +; GCN-NEXT: v_ldexp_f32 v1, v1, v3 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_add_u32_e32 v0, s7, v0 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[96:99] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v12, v1, a[0:31] +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[128:131] ; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1335,14 +1335,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 ; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 ; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 ; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 @@ -1359,14 +1359,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 ; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 ; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: @@ -1387,57 +1387,57 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:48 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v9, 1.0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v9, s1, v3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v12, 1.0 ; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v5 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s1, v3 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v9 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x42b17218 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v11, v9, v10 +; EXACTCUTOFF-NEXT: v_fma_f32 v9, s1, v3, -v9 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fma_f32 v10, s1, v3, -v10 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v9, s1, v7 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s1, v7 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v9, v11, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:8304 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127] -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v10 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v10 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v12, v4, a[128:159] +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v9 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v10 +; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v9 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v9, s2, v3 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v9 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v11, v9, v10 +; EXACTCUTOFF-NEXT: v_fma_f32 v9, s2, v3, -v9 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v9, s2, v7 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s2, v3 -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s2, v3, -v10 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s2, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v4 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v9, v11, v9 ; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v12, v4, a[96:127] +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v9 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v10 ; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672 ; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656 ; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640 @@ -1454,60 +1454,60 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:49184 ; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:49168 ; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:49152 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v10 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v9 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v4 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3 +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v9 +; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7 ; EXACTCUTOFF-NEXT: s_load_dword s8, s[4:5], 0x54 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v4 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v12, v1, a[64:95] +; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v10, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v10 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v9 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v2 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v2 offset:57440 ; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v4 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3 +; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v4 ; EXACTCUTOFF-NEXT: v_fma_f32 v3, s8, v3, -v4 -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63] -; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v1, v4 -; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v1 ; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s8, v7 -; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3 -; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3 -; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v1, v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 -; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v3, v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v2 offset:57424 +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v12, v1, a[32:63] +; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v9 +; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v3 +; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 +; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v3, v9 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v2 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v2 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v2 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v2 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v2 offset:57392 +; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v3 ; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] +; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v12, v1, a[0:31] +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s7 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1520,14 +1520,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:8208 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448 @@ -1544,14 +1544,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 ; EXACTCUTOFF-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 1d08097452ce6..347fddbedb0a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -20,19 +20,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; SDAG-NEXT: v_mov_b32_e32 v5, s16 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: @@ -518,19 +518,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v5, s16 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -797,12 +797,12 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 @@ -811,12 +811,12 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 ; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr: @@ -1308,12 +1308,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 @@ -1322,12 +1322,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 ; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: @@ -1470,12 +1470,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 @@ -1484,12 +1484,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 ; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: @@ -1632,12 +1632,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 @@ -1646,12 +1646,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 ; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: @@ -1794,12 +1794,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 @@ -1808,12 +1808,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 ; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 84db54c2d537f..4eeb98bc11ea3 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -76,7 +76,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt +; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-SDAG-NEXT: s_mov_b32 s5, s12 @@ -87,9 +87,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX942-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt +; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: buffer_nontemporal_load_store: @@ -357,7 +357,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 @@ -369,8 +369,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll index deb97a9812b42..053cf0e1c6906 100644 --- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll +++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll @@ -236,8 +236,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB8_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] +; GFX942-NEXT: global_load_dwordx4 v[16:19], v16, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v19 ; GFX942-NEXT: v_lshrrev_b32_e32 v14, 16, v19 diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index 1e042d3b4a31f..41ffd01fc7e23 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -12,19 +12,19 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX942-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[12:13], v[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[10:11], v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, s4 -; GFX942-NEXT: v_mov_b32_e32 v13, s5 +; GFX942-NEXT: v_mov_b32_e32 v8, s4 +; GFX942-NEXT: v_mov_b32_e32 v9, s5 ; GFX942-NEXT: v_mov_b32_e32 v4, s6 ; GFX942-NEXT: v_mov_b32_e32 v5, s7 ; GFX942-NEXT: v_mov_b32_e32 v6, s7 ; GFX942-NEXT: v_mov_b32_e32 v7, s7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[12:13], v[4:7], v13 +; GFX942-NEXT: v_smfmac_i32_16x16x64_i8 v[10:13], v[8:9], v[4:7], v9 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dword v0, v11, s[2:3] offset:12 +; GFX942-NEXT: global_store_dword v0, v13, s[2:3] offset:12 ; GFX942-NEXT: s_endpgm entry: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index d2008be4fd32a..fc32bc644ddcd 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -8,34 +8,33 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; CHECK-NEXT: v_pk_mov_b32 v[46:47], 0, 0 -; CHECK-NEXT: flat_load_dword v42, v[46:47] +; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0 +; CHECK-NEXT: flat_load_dword v42, v[44:45] ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_load_dwordx4 s[64:67], s[34:35], 0x8 ; CHECK-NEXT: s_load_dword s68, s[34:35], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base -; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s68, -1 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_cselect_b32 s5, s9, 0 -; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_cselect_b32 s6, s68, 0 -; CHECK-NEXT: v_mov_b32_e32 v57, s5 -; CHECK-NEXT: s_mov_b32 s5, s4 ; CHECK-NEXT: s_add_u32 s50, s34, 48 -; CHECK-NEXT: v_accvgpr_write_b32 a33, s5 +; CHECK-NEXT: v_mov_b32_e32 v47, s5 +; CHECK-NEXT: s_mov_b32 s5, s4 ; CHECK-NEXT: s_addc_u32 s51, s35, 0 -; CHECK-NEXT: v_accvgpr_write_b32 a32, s4 +; CHECK-NEXT: v_pk_mov_b32 v[62:63], s[4:5], s[4:5] op_sel:[0,1] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b32 s53, s14 -; CHECK-NEXT: v_mov_b32_e32 v56, s6 -; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[64:65], s[64:65] op_sel:[0,1] +; CHECK-NEXT: v_mov_b32_e32 v46, s6 +; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[64:65], s[64:65] op_sel:[0,1] ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51] @@ -48,15 +47,15 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s52, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: v_mov_b32_e32 v62, s66 -; CHECK-NEXT: v_mov_b32_e32 v63, s67 -; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] +; CHECK-NEXT: v_mov_b32_e32 v60, s66 +; CHECK-NEXT: v_mov_b32_e32 v61, s67 +; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[62:63] ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] -; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[58:59] -; CHECK-NEXT: v_mov_b32_e32 v44, 0 -; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000 +; CHECK-NEXT: flat_load_dwordx2 a[32:33], v[56:57] +; CHECK-NEXT: v_mov_b32_e32 v58, 0 +; CHECK-NEXT: v_mov_b32_e32 v59, 0x3ff00000 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51] @@ -65,29 +64,31 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s13, s52 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: flat_store_dwordx2 v[46:47], v[44:45] -; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] +; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[58:59] +; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[62:63] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] -; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc +; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s67 ; CHECK-NEXT: v_mov_b32_e32 v0, s68 ; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42 -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] +; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63] +; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61] ; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v58, v0, s[0:3], 0 offen +; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_4 ; CHECK-NEXT: ; %bb.1: ; %LeafBlock5 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v42 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.2: ; %sw.bb17.i.i.i.i -; CHECK-NEXT: v_mov_b32_e32 v44, 1 +; CHECK-NEXT: v_mov_b32_e32 v4, 1 ; CHECK-NEXT: ; %bb.3: ; %Flow ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: .LBB0_4: ; %Flow8 @@ -105,10 +106,10 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ; %bb.7: ; %Flow7 ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v44, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: .LBB0_8: ; %bb.1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; %sw.bb.i.i.i.i.i diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index b045c761436de..5e18b469a4e88 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -6,34 +6,35 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v4 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX942-NEXT: v_mov_b32_e32 v2, 8 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v1, s[0:1] ; GFX942-NEXT: s_mov_b32 s4, 0xff0000 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_or_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX942-NEXT: v_and_or_b32 v3, v3, s4, v5 +; GFX942-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX942-NEXT: v_and_or_b32 v3, v3, s4, v4 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB0_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dword v1, v1, s[2:3] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: global_load_dword v0, v0, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX942-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX942-NEXT: v_and_or_b32 v3, v1, s4, v2 +; GFX942-NEXT: v_and_or_b32 v3, v0, s4, v2 ; GFX942-NEXT: .LBB0_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: global_store_byte_d16_hi v0, v3, s[6:7] offset:2 -; GFX942-NEXT: global_store_short v0, v3, s[6:7] +; GFX942-NEXT: global_store_byte_d16_hi v1, v3, s[6:7] offset:2 +; GFX942-NEXT: global_store_short v1, v3, s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -57,20 +58,21 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: global_load_dword v2, v1, s[0:1] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB1_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dword v2, v1, s[2:3] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: global_load_dword v2, v0, s[2:3] ; GFX942-NEXT: .LBB1_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dword v0, v2, s[6:7] +; GFX942-NEXT: global_store_dword v1, v2, s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -94,24 +96,25 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB2_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX942-NEXT: .LBB2_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: global_store_byte v2, v1, s[6:7] offset:4 -; GFX942-NEXT: global_store_dword v2, v0, s[6:7] +; GFX942-NEXT: global_store_byte v3, v1, s[6:7] offset:4 +; GFX942-NEXT: global_store_dword v3, v0, s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -135,20 +138,21 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[0:1] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB3_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v0, s[2:3] ; GFX942-NEXT: .LBB3_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7] +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -172,20 +176,21 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v6 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB4_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3] ; GFX942-NEXT: .LBB4_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -209,24 +214,25 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v10 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16 ; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB5_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:16 -; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3] ; GFX942-NEXT: .LBB5_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:16 +; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:16 ; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -250,9 +256,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v62, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v62 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240 ; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224 @@ -270,52 +276,53 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32 ; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16 ; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v62 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB6_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[2:3] offset:240 -; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[2:3] offset:224 -; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[2:3] offset:208 -; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[2:3] offset:192 -; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[2:3] offset:176 -; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[2:3] offset:160 -; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:144 -; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] offset:128 -; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:112 -; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[2:3] offset:96 -; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[2:3] offset:80 -; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[2:3] offset:64 -; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[2:3] offset:48 -; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[2:3] offset:32 -; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[2:3] offset:16 -; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[2:3] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX942-NEXT: global_load_dwordx4 v[30:33], v0, s[2:3] offset:240 +; GFX942-NEXT: global_load_dwordx4 v[26:29], v0, s[2:3] offset:224 +; GFX942-NEXT: global_load_dwordx4 v[22:25], v0, s[2:3] offset:208 +; GFX942-NEXT: global_load_dwordx4 v[18:21], v0, s[2:3] offset:192 +; GFX942-NEXT: global_load_dwordx4 v[14:17], v0, s[2:3] offset:176 +; GFX942-NEXT: global_load_dwordx4 v[10:13], v0, s[2:3] offset:160 +; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:144 +; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3] offset:128 +; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3] offset:112 +; GFX942-NEXT: global_load_dwordx4 v[58:61], v0, s[2:3] offset:96 +; GFX942-NEXT: global_load_dwordx4 v[54:57], v0, s[2:3] offset:80 +; GFX942-NEXT: global_load_dwordx4 v[50:53], v0, s[2:3] offset:64 +; GFX942-NEXT: global_load_dwordx4 v[46:49], v0, s[2:3] offset:48 +; GFX942-NEXT: global_load_dwordx4 v[42:45], v0, s[2:3] offset:32 +; GFX942-NEXT: global_load_dwordx4 v[38:41], v0, s[2:3] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[34:37], v0, s[2:3] ; GFX942-NEXT: .LBB6_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:112 +; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] offset:112 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[58:61], s[6:7] offset:96 +; GFX942-NEXT: global_store_dwordx4 v1, v[58:61], s[6:7] offset:96 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[54:57], s[6:7] offset:80 +; GFX942-NEXT: global_store_dwordx4 v1, v[54:57], s[6:7] offset:80 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[50:53], s[6:7] offset:64 +; GFX942-NEXT: global_store_dwordx4 v1, v[50:53], s[6:7] offset:64 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[46:49], s[6:7] offset:48 +; GFX942-NEXT: global_store_dwordx4 v1, v[46:49], s[6:7] offset:48 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[42:45], s[6:7] offset:32 +; GFX942-NEXT: global_store_dwordx4 v1, v[42:45], s[6:7] offset:32 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[38:41], s[6:7] offset:16 +; GFX942-NEXT: global_store_dwordx4 v1, v[38:41], s[6:7] offset:16 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[34:37], s[6:7] -; GFX942-NEXT: global_store_dwordx4 v0, v[30:33], s[6:7] offset:240 -; GFX942-NEXT: global_store_dwordx4 v0, v[26:29], s[6:7] offset:224 -; GFX942-NEXT: global_store_dwordx4 v0, v[22:25], s[6:7] offset:208 -; GFX942-NEXT: global_store_dwordx4 v0, v[18:21], s[6:7] offset:192 -; GFX942-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] offset:176 -; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] offset:160 -; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:144 -; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] offset:128 +; GFX942-NEXT: global_store_dwordx4 v1, v[34:37], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v1, v[30:33], s[6:7] offset:240 +; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[6:7] offset:224 +; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[6:7] offset:208 +; GFX942-NEXT: global_store_dwordx4 v1, v[18:21], s[6:7] offset:192 +; GFX942-NEXT: global_store_dwordx4 v1, v[14:17], s[6:7] offset:176 +; GFX942-NEXT: global_store_dwordx4 v1, v[10:13], s[6:7] offset:160 +; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:144 +; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7] offset:128 ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -400,6 +407,7 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB8_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -457,6 +465,7 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB9_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[10:11] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX942-NEXT: s_waitcnt vmcnt(1) @@ -507,85 +516,86 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-LABEL: v8i8_phi_const: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v16, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v16 -; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v16 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v16 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v4 +; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9] ; GFX942-NEXT: ; implicit-def: $vgpr2 -; GFX942-NEXT: ; implicit-def: $vgpr12 -; GFX942-NEXT: ; implicit-def: $vgpr10 ; GFX942-NEXT: ; implicit-def: $vgpr13 -; GFX942-NEXT: ; implicit-def: $vgpr14 ; GFX942-NEXT: ; implicit-def: $vgpr11 +; GFX942-NEXT: ; implicit-def: $vgpr14 ; GFX942-NEXT: ; implicit-def: $vgpr15 +; GFX942-NEXT: ; implicit-def: $vgpr12 +; GFX942-NEXT: ; implicit-def: $vgpr16 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v0 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX942-NEXT: v_lshrrev_b32_e32 v10, 8, v0 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB10_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v3, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v16 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v4 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v0, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX942-NEXT: v_mov_b32_e32 v4, 8 -; GFX942-NEXT: v_mov_b32_e32 v5, 7 -; GFX942-NEXT: v_mov_b32_e32 v6, 6 -; GFX942-NEXT: v_mov_b32_e32 v1, 5 -; GFX942-NEXT: v_mov_b32_e32 v7, 4 -; GFX942-NEXT: v_mov_b32_e32 v8, 3 -; GFX942-NEXT: v_mov_b32_e32 v9, 2 ; GFX942-NEXT: v_mov_b32_e32 v0, 1 +; GFX942-NEXT: v_mov_b32_e32 v10, 2 +; GFX942-NEXT: v_mov_b32_e32 v9, 3 +; GFX942-NEXT: v_mov_b32_e32 v8, 4 +; GFX942-NEXT: v_mov_b32_e32 v1, 5 +; GFX942-NEXT: v_mov_b32_e32 v7, 6 +; GFX942-NEXT: v_mov_b32_e32 v6, 7 +; GFX942-NEXT: v_mov_b32_e32 v5, 8 ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v16, 24, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX942-NEXT: .LBB10_2: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB10_4 ; GFX942-NEXT: ; %bb.3: ; %bb.2 -; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v9 -; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v10 +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v8 ; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v4 +; GFX942-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v5 ; GFX942-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7 ; GFX942-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[12:13] +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] ; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v12, v9 -; GFX942-NEXT: v_mov_b32_e32 v10, v8 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: v_mov_b32_e32 v13, v10 +; GFX942-NEXT: v_mov_b32_e32 v11, v9 +; GFX942-NEXT: v_mov_b32_e32 v14, v8 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v14, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v15, v4 +; GFX942-NEXT: v_mov_b32_e32 v15, v7 +; GFX942-NEXT: v_mov_b32_e32 v12, v6 +; GFX942-NEXT: v_mov_b32_e32 v16, v5 ; GFX942-NEXT: .LBB10_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v13 +; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14 ; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v15 +; GFX942-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v16 ; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14 +; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v15 ; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15] @@ -617,30 +627,31 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX942-LABEL: v8i8_multi_block: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[8:9] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_4 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_3 ; GFX942-NEXT: ; %bb.2: ; %bb.2 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[12:13] ; GFX942-NEXT: .LBB11_3: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: .LBB11_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15] +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -858,16 +869,17 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942-LABEL: v8i8_mfma_i8: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB14_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v0, s[10:11] ; GFX942-NEXT: .LBB14_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0 @@ -880,7 +892,7 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13] +; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[12:13] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -908,16 +920,17 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-LABEL: v8i8_mfma_half: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37] +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB15_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39] +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v0, s[38:39] ; GFX942-NEXT: .LBB15_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0 @@ -960,14 +973,14 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112 -; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96 -; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80 -; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64 -; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48 -; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32 -; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41] +; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[40:41] offset:112 +; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[40:41] offset:96 +; GFX942-NEXT: global_store_dwordx4 v1, a[20:23], s[40:41] offset:80 +; GFX942-NEXT: global_store_dwordx4 v1, a[16:19], s[40:41] offset:64 +; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[40:41] offset:48 +; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[40:41] offset:32 +; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[40:41] offset:16 +; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[40:41] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -995,21 +1008,23 @@ define amdgpu_kernel void @v8i8_intrinsic(ptr addrspace(1) %src1, ptr addrspace( ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB16_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX942-NEXT: .LBB16_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1] -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x()