diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 286fbfd373b59..f1c14ff0045a3 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -396,13 +396,15 @@ bool MachineLICMImpl::run(MachineFunction &MF) { LLVM_DEBUG(dbgs() << MF.getName() << " ********\n"); if (PreRegAlloc) { + RegisterClassInfo RegClassInfo; + RegClassInfo.runOnMachineFunction(MF); // Estimate register pressure during pre-regalloc pass. unsigned NumRPS = TRI->getNumRegPressureSets(); RegPressure.resize(NumRPS); llvm::fill(RegPressure, 0); RegLimit.resize(NumRPS); for (unsigned i = 0, e = NumRPS; i != e; ++i) - RegLimit[i] = TRI->getRegPressureSetLimit(MF, i); + RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i); } if (HoistConstLoads) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 666523c88860c..39c5b4d5a4741 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -330,13 +330,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -375,13 +375,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -399,13 +399,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -475,21 +475,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -519,20 +519,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -542,20 +542,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -608,15 +608,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -646,15 +646,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -692,15 +692,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -718,15 +718,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -764,21 +764,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -801,22 +801,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -846,21 +846,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -870,21 +870,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -925,13 +925,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -970,13 +970,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -994,13 +994,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1066,21 +1066,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1112,20 +1112,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1135,20 +1135,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1198,15 +1198,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1236,15 +1236,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1282,15 +1282,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1311,15 +1311,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1352,21 +1352,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1389,22 +1389,22 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1435,21 +1435,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1459,24 +1459,24 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1513,20 +1513,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s16 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX942-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1562,19 +1563,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1588,23 +1590,24 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1615,23 +1618,24 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -1670,23 +1674,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s16 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_max_f32_e32 v1, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX942-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, s16 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1718,22 +1723,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, s20 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1743,23 +1749,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1769,23 +1776,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1812,26 +1820,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1854,27 +1863,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1906,27 +1917,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s20 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 @@ -1937,27 +1949,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s20 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 @@ -1985,24 +1998,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2025,25 +2040,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v2, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2075,26 +2093,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v10, s20 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2104,26 +2123,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, s20 +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 351502816ae6e..86ace7708e67f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -330,13 +330,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -375,13 +375,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -399,13 +399,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -475,21 +475,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -519,20 +519,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -542,20 +542,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -608,15 +608,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -646,15 +646,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -692,15 +692,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -718,15 +718,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -764,21 +764,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -801,22 +801,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -846,21 +846,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -870,21 +870,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -925,13 +925,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -970,13 +970,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -994,13 +994,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1066,21 +1066,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1112,20 +1112,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1135,20 +1135,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1198,15 +1198,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1236,15 +1236,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1282,15 +1282,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1311,15 +1311,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1352,21 +1352,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1389,22 +1389,22 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1435,21 +1435,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1459,24 +1459,24 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1513,20 +1513,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s16 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX942-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1562,19 +1563,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1588,23 +1590,24 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1615,23 +1618,24 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -1670,23 +1674,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, s16 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: v_mov_b32_e32 v1, s16 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_max_f32_e32 v1, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX942-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, s16 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1718,22 +1723,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX90A-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, s20 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1743,23 +1749,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1769,23 +1776,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1812,26 +1820,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1854,27 +1863,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1906,27 +1917,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s20 +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 @@ -1937,27 +1949,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s20 +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 @@ -1985,24 +1998,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2025,25 +2040,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v2, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2075,26 +2093,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v10, s20 +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2104,26 +2123,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, s20 +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 3160e38df5e3f..5687db97b8d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -514,10 +514,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc ; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10 ; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18 -; GFX908-NEXT: s_mov_b32 s12, 0 -; GFX908-NEXT: s_mov_b32 s9, s12 +; GFX908-NEXT: s_mov_b32 s10, 0 +; GFX908-NEXT: s_mov_b32 s9, s10 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX908-NEXT: s_sub_i32 s1, 0, s7 @@ -528,40 +528,40 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX908-NEXT: v_readfirstlane_b32 s2, v2 -; GFX908-NEXT: s_mul_i32 s1, s1, s2 -; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1 -; GFX908-NEXT: s_add_i32 s2, s2, s1 -; GFX908-NEXT: s_mul_hi_u32 s1, s6, s2 -; GFX908-NEXT: s_mul_i32 s2, s1, s7 -; GFX908-NEXT: s_sub_i32 s2, s6, s2 -; GFX908-NEXT: s_add_i32 s3, s1, 1 -; GFX908-NEXT: s_sub_i32 s6, s2, s7 -; GFX908-NEXT: s_cmp_ge_u32 s2, s7 -; GFX908-NEXT: s_cselect_b32 s1, s3, s1 -; GFX908-NEXT: s_cselect_b32 s2, s6, s2 -; GFX908-NEXT: s_add_i32 s3, s1, 1 -; GFX908-NEXT: s_cmp_ge_u32 s2, s7 -; GFX908-NEXT: s_cselect_b32 s8, s3, s1 -; GFX908-NEXT: s_lshr_b32 s2, s0, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v2 +; GFX908-NEXT: s_mul_i32 s1, s1, s8 +; GFX908-NEXT: s_mul_hi_u32 s1, s8, s1 +; GFX908-NEXT: s_add_i32 s8, s8, s1 +; GFX908-NEXT: s_mul_hi_u32 s1, s6, s8 +; GFX908-NEXT: s_mul_i32 s8, s1, s7 +; GFX908-NEXT: s_sub_i32 s6, s6, s8 +; GFX908-NEXT: s_add_i32 s11, s1, 1 +; GFX908-NEXT: s_sub_i32 s8, s6, s7 +; GFX908-NEXT: s_cmp_ge_u32 s6, s7 +; GFX908-NEXT: s_cselect_b32 s1, s11, s1 +; GFX908-NEXT: s_cselect_b32 s6, s8, s6 +; GFX908-NEXT: s_add_i32 s8, s1, 1 +; GFX908-NEXT: s_cmp_ge_u32 s6, s7 +; GFX908-NEXT: s_cselect_b32 s8, s8, s1 +; GFX908-NEXT: s_lshr_b32 s11, s0, 16 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[8:9], 5 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s11 ; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[2:3], 5 ; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_or_b32 s14, s14, 28 -; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 +; GFX908-NEXT: s_or_b32 s12, s12, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s2, v16 -; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX908-NEXT: s_mul_i32 s3, s5, s2 -; GFX908-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX908-NEXT: s_mul_i32 s2, s4, s2 -; GFX908-NEXT: s_add_i32 s3, s5, s3 -; GFX908-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 +; GFX908-NEXT: v_readfirstlane_b32 s9, v16 +; GFX908-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX908-NEXT: s_mul_i32 s5, s5, s9 +; GFX908-NEXT: s_mul_hi_u32 s11, s4, s9 +; GFX908-NEXT: s_mul_i32 s4, s4, s9 +; GFX908-NEXT: s_add_i32 s5, s11, s5 +; GFX908-NEXT: s_lshl_b64 s[4:5], s[4:5], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -572,31 +572,29 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 -; GFX908-NEXT: s_mov_b32 s13, s12 -; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX908-NEXT: v_mov_b32_e32 v4, s12 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s12 -; GFX908-NEXT: v_mov_b32_e32 v8, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s13 -; GFX908-NEXT: v_mov_b32_e32 v7, s13 -; GFX908-NEXT: v_mov_b32_e32 v9, s13 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 +; GFX908-NEXT: s_mov_b32 s11, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, s10 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: v_mov_b32_e32 v8, s10 +; GFX908-NEXT: v_mov_b32_e32 v5, s11 +; GFX908-NEXT: v_mov_b32_e32 v7, s11 +; GFX908-NEXT: v_mov_b32_e32 v9, s11 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[2:3], 0 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[18:19], s[2:3], -1 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX908-NEXT: s_mov_b64 s[20:21], s[12:13] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s9, v2 -; GFX908-NEXT: v_readfirstlane_b32 s13, v3 +; GFX908-NEXT: v_readfirstlane_b32 s11, v3 ; GFX908-NEXT: s_add_u32 s9, s9, 1 -; GFX908-NEXT: s_addc_u32 s13, s13, 0 +; GFX908-NEXT: s_addc_u32 s11, s11, 0 ; GFX908-NEXT: s_mul_hi_u32 s22, s6, s9 -; GFX908-NEXT: s_mul_i32 s13, s6, s13 +; GFX908-NEXT: s_mul_i32 s11, s6, s11 ; GFX908-NEXT: s_mul_i32 s23, s7, s9 -; GFX908-NEXT: s_add_i32 s13, s22, s13 +; GFX908-NEXT: s_add_i32 s11, s22, s11 ; GFX908-NEXT: s_mul_i32 s9, s6, s9 -; GFX908-NEXT: s_add_i32 s13, s13, s23 +; GFX908-NEXT: s_add_i32 s11, s11, s23 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 @@ -612,7 +610,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: s_add_u32 s22, s20, s9 -; GFX908-NEXT: s_addc_u32 s23, s21, s13 +; GFX908-NEXT: s_addc_u32 s23, s21, s11 ; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc @@ -623,7 +621,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v19 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 -; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -650,7 +648,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19] +; GFX908-NEXT: s_mov_b64 s[22:23], s[16:17] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -662,16 +660,16 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[2:3], -1 +; GFX908-NEXT: s_mov_b64 s[16:17], -1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s10, s10, s8 -; GFX908-NEXT: s_addc_u32 s11, s11, 0 -; GFX908-NEXT: s_add_u32 s14, s14, s16 -; GFX908-NEXT: s_addc_u32 s15, s15, s17 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: s_add_u32 s2, s2, s8 +; GFX908-NEXT: s_addc_u32 s3, s3, 0 +; GFX908-NEXT: s_add_u32 s12, s12, s14 +; GFX908-NEXT: s_addc_u32 s13, s13, s15 +; GFX908-NEXT: s_mov_b64 s[16:17], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -680,10 +678,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10 ; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18 -; GFX90A-NEXT: s_mov_b32 s12, 0 -; GFX90A-NEXT: s_mov_b32 s9, s12 +; GFX90A-NEXT: s_mov_b32 s10, 0 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX90A-NEXT: s_sub_i32 s1, 0, s7 @@ -693,40 +691,40 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX90A-NEXT: v_readfirstlane_b32 s2, v3 -; GFX90A-NEXT: s_mul_i32 s1, s1, s2 -; GFX90A-NEXT: s_mul_hi_u32 s1, s2, s1 -; GFX90A-NEXT: s_add_i32 s2, s2, s1 -; GFX90A-NEXT: s_mul_hi_u32 s1, s6, s2 -; GFX90A-NEXT: s_mul_i32 s2, s1, s7 -; GFX90A-NEXT: s_sub_i32 s2, s6, s2 -; GFX90A-NEXT: s_add_i32 s3, s1, 1 -; GFX90A-NEXT: s_sub_i32 s6, s2, s7 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 -; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 -; GFX90A-NEXT: s_cselect_b32 s2, s6, s2 -; GFX90A-NEXT: s_add_i32 s3, s1, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 -; GFX90A-NEXT: s_cselect_b32 s8, s3, s1 -; GFX90A-NEXT: s_lshr_b32 s2, s0, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v3 +; GFX90A-NEXT: s_mul_i32 s1, s1, s8 +; GFX90A-NEXT: s_mul_hi_u32 s1, s8, s1 +; GFX90A-NEXT: s_add_i32 s8, s8, s1 +; GFX90A-NEXT: s_mul_hi_u32 s1, s6, s8 +; GFX90A-NEXT: s_mul_i32 s8, s1, s7 +; GFX90A-NEXT: s_sub_i32 s6, s6, s8 +; GFX90A-NEXT: s_add_i32 s11, s1, 1 +; GFX90A-NEXT: s_sub_i32 s8, s6, s7 +; GFX90A-NEXT: s_cmp_ge_u32 s6, s7 +; GFX90A-NEXT: s_cselect_b32 s1, s11, s1 +; GFX90A-NEXT: s_cselect_b32 s6, s8, s6 +; GFX90A-NEXT: s_add_i32 s8, s1, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s6, s7 +; GFX90A-NEXT: s_cselect_b32 s8, s8, s1 +; GFX90A-NEXT: s_lshr_b32 s11, s0, 16 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[8:9], 5 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s11 ; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[2:3], 5 ; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: s_or_b32 s14, s14, 28 -; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 +; GFX90A-NEXT: s_or_b32 s12, s12, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s2, v18 -; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX90A-NEXT: s_mul_i32 s3, s5, s2 -; GFX90A-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX90A-NEXT: s_mul_i32 s2, s4, s2 -; GFX90A-NEXT: s_add_i32 s3, s5, s3 -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v18 +; GFX90A-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX90A-NEXT: s_mul_i32 s5, s5, s9 +; GFX90A-NEXT: s_mul_hi_u32 s11, s4, s9 +; GFX90A-NEXT: s_mul_i32 s4, s4, s9 +; GFX90A-NEXT: s_add_i32 s5, s11, s5 +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[4:5], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 @@ -737,27 +735,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 -; GFX90A-NEXT: s_mov_b32 s13, s12 -; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[2:3], 0 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[18:19], s[2:3], -1 +; GFX90A-NEXT: s_mov_b64 s[20:21], s[12:13] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s9, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s13, v5 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v5 ; GFX90A-NEXT: s_add_u32 s9, s9, 1 -; GFX90A-NEXT: s_addc_u32 s13, s13, 0 +; GFX90A-NEXT: s_addc_u32 s11, s11, 0 ; GFX90A-NEXT: s_mul_hi_u32 s22, s6, s9 -; GFX90A-NEXT: s_mul_i32 s13, s6, s13 +; GFX90A-NEXT: s_mul_i32 s11, s6, s11 ; GFX90A-NEXT: s_mul_i32 s23, s7, s9 -; GFX90A-NEXT: s_add_i32 s13, s22, s13 +; GFX90A-NEXT: s_add_i32 s11, s22, s11 ; GFX90A-NEXT: s_mul_i32 s9, s6, s9 -; GFX90A-NEXT: s_add_i32 s13, s13, s23 +; GFX90A-NEXT: s_add_i32 s11, s11, s23 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 @@ -773,7 +769,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: s_add_u32 s22, s20, s9 -; GFX90A-NEXT: s_addc_u32 s23, s21, s13 +; GFX90A-NEXT: s_addc_u32 s23, s21, s11 ; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc @@ -784,7 +780,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 @@ -804,7 +800,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19] +; GFX90A-NEXT: s_mov_b64 s[22:23], s[16:17] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -816,16 +812,16 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[2:3], -1 +; GFX90A-NEXT: s_mov_b64 s[16:17], -1 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s10, s10, s8 -; GFX90A-NEXT: s_addc_u32 s11, s11, 0 -; GFX90A-NEXT: s_add_u32 s14, s14, s16 -; GFX90A-NEXT: s_addc_u32 s15, s15, s17 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_add_u32 s2, s2, s8 +; GFX90A-NEXT: s_addc_u32 s3, s3, 0 +; GFX90A-NEXT: s_add_u32 s12, s12, s14 +; GFX90A-NEXT: s_addc_u32 s13, s13, s15 +; GFX90A-NEXT: s_mov_b64 s[16:17], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 0fccdba729132..6fdaaf2fa0a40 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -11089,15 +11089,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX7LESS-NEXT: s_mov_b64 s[8:9], 0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7LESS-NEXT: s_lshl_b32 s11, s6, 16 ; GFX7LESS-NEXT: s_and_b32 s4, s2, -4 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 +; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7LESS-NEXT: s_and_b32 s2, s2, 3 ; GFX7LESS-NEXT: s_lshl_b32 s10, s2, 3 -; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX7LESS-NEXT: v_mul_f32_e64 v0, 1.0, s6 ; GFX7LESS-NEXT: s_lshl_b32 s2, 0xffff, s10 -; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7LESS-NEXT: s_not_b32 s2, s2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 @@ -11105,11 +11103,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX7LESS-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s11 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, s10, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_and_b32_e32 v2, s2, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 @@ -11774,26 +11774,26 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, <2 x half> %val) { ; GFX7LESS-LABEL: uniform_fadd_v2f16: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_mov_b64 s[8:9], 0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_lshr_b32 s4, s6, 16 +; GFX7LESS-NEXT: s_mov_b32 s4, s2 +; GFX7LESS-NEXT: s_mov_b32 s5, s3 +; GFX7LESS-NEXT: s_lshr_b32 s2, s6, 16 ; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, s6 -; GFX7LESS-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GFX7LESS-NEXT: s_lshr_b32 s4, s5, 16 -; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX7LESS-NEXT: s_lshr_b32 s2, s3, 16 +; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v3, s2 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: s_mov_b32 s4, s2 -; GFX7LESS-NEXT: s_mov_b32 s5, s3 ; GFX7LESS-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -12070,56 +12070,56 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val) { ; GFX7LESS-LABEL: uniform_fadd_v2bf16: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7LESS-NEXT: s_mov_b64 s[8:9], 0 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_and_b32 s4, s6, 0xffff0000 -; GFX7LESS-NEXT: s_lshl_b32 s5, s6, 16 -; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0x0 -; GFX7LESS-NEXT: v_mul_f32_e64 v0, 1.0, s5 -; GFX7LESS-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_and_b32 s4, s6, 0xffff0000 -; GFX7LESS-NEXT: s_lshl_b32 s5, s6, 16 -; GFX7LESS-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s5 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s4, s2 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 +; GFX7LESS-NEXT: s_load_dword s9, s[4:5], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], 0 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_and_b32 s10, s9, 0xffff0000 +; GFX7LESS-NEXT: s_lshl_b32 s11, s9, 16 +; GFX7LESS-NEXT: s_lshl_b32 s9, s6, 16 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s11 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s10 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7LESS-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7LESS-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7LESS-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7LESS-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7LESS-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, v3 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc +; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s9 +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7LESS-NEXT: v_mul_f32_e64 v3, 1.0, s8 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX7LESS-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7LESS-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7LESS-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX7LESS-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX7LESS-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX7LESS-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7LESS-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX7LESS-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7LESS-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll index 535f05bc01b42..6a954284cc681 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll @@ -10,9 +10,7 @@ define amdgpu_kernel void @v_atomicrmw_fadd_bf16(ptr addrspace(1) %out, i1 %in, ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 ; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, -4 @@ -21,7 +19,7 @@ define amdgpu_kernel void @v_atomicrmw_fadd_bf16(ptr addrspace(1) %out, i1 %in, ; GFX11-TRUE16-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, 0xffff, s2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s3 @@ -35,18 +33,19 @@ define amdgpu_kernel void @v_atomicrmw_fadd_bf16(ptr addrspace(1) %out, i1 %in, ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v0, v4, v[0:1], s[0:1] glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 37040123ee20c..ac90ed95b76ff 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -5,23 +5,20 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-LABEL: vgpr_mfma_pass_av_split_crash: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 -; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 -; CHECK-NEXT: v_mov_b32_e32 v20, 0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 -; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 -; CHECK-NEXT: s_xor_b64 s[18:19], s[16:17], -1 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_xor_b64 s[10:11], s[8:9], -1 ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 ; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 -; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 -; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 +; CHECK-NEXT: v_mov_b32_e32 v20, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 +; CHECK-NEXT: s_xor_b64 s[14:15], s[12:13], -1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6 ; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f ; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90 @@ -34,35 +31,33 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_mov_b32_e32 v11, 0xbf8c6ea4 ; CHECK-NEXT: v_mov_b32_e32 v12, 0xe82d3ff0 ; CHECK-NEXT: v_mov_b32_e32 v13, 0xbfa59976 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 ; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883 ; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4 -; CHECK-NEXT: s_mov_b64 s[22:23], 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136 -; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523 ; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555 -; CHECK-NEXT: s_and_b64 s[6:7], exec, s[18:19] ; CHECK-NEXT: v_mov_b32_e32 v21, v20 ; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31 ; CHECK-NEXT: ; implicit-def: $vgpr22_vgpr23 ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_1: ; %Flow9 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[24:25] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; CHECK-NEXT: s_cbranch_vccz .LBB0_17 ; CHECK-NEXT: .LBB0_2: ; %._crit_edge1942.i.i.i3548 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[12:13] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_9 ; CHECK-NEXT: ; %bb.3: ; %.preheader1868.i.i.i3244 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[4:5] +; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_cbranch_vccz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %.preheader1855.i.i.i3329.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] +; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25] ; CHECK-NEXT: v_accvgpr_read_b32 v27, a3 ; CHECK-NEXT: v_accvgpr_read_b32 v26, a2 @@ -91,73 +86,73 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_11 ; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 ; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_accvgpr_read_b32 v29, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v28, a0 -; CHECK-NEXT: s_mov_b64 s[24:25], -1 -; CHECK-NEXT: s_mov_b64 s[8:9], -1 -; CHECK-NEXT: s_mov_b64 vcc, s[2:3] +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_and_b64 vcc, exec, s[12:13] +; CHECK-NEXT: s_mov_b64 s[20:21], -1 ; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 +; CHECK-NEXT: s_and_b64 vcc, exec, s[10:11] ; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 -; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] -; CHECK-NEXT: s_mov_b64 vcc, s[6:7] +; CHECK-NEXT: s_mov_b64 s[20:21], s[10:11] ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v26 -; CHECK-NEXT: s_mov_b64 s[24:25], 0 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v27 -; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] -; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21] +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[2:3] +; CHECK-NEXT: s_mov_b64 s[0:1], s[14:15] ; CHECK-NEXT: s_branch .LBB0_15 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 ; CHECK-NEXT: s_branch .LBB0_15 ; CHECK-NEXT: .LBB0_11: ; %loop.exit.guard ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[24:25] +; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19] ; CHECK-NEXT: s_cbranch_vccz .LBB0_13 ; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[28:29] -; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] -; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17] -; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] +; CHECK-NEXT: v_cmp_nlg_f64_e64 s[0:1], 0, v[28:29] +; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[8:9] +; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[8:9] +; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[0:1] ; CHECK-NEXT: v_mov_b32_e32 v17, v16 -; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] -; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13] -; CHECK-NEXT: s_cselect_b32 s23, s23, 0 -; CHECK-NEXT: s_cselect_b32 s22, s22, 0 -; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: s_and_b64 s[0:1], exec, s[8:9] +; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[4:5] +; CHECK-NEXT: s_cselect_b32 s17, s17, 0 +; CHECK-NEXT: s_cselect_b32 s16, s16, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: s_branch .LBB0_14 ; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 ; CHECK-NEXT: .LBB0_14: ; %Flow6 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25] ; CHECK-NEXT: .LBB0_15: ; %Flow6 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_mov_b64 s[24:25], -1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB0_1 ; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13] +; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[4:5] ; CHECK-NEXT: s_branch .LBB0_1 ; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 0ceb9019eb990..c5f1c50a2d4fa 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -57,23 +57,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 @@ -98,18 +98,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -125,18 +125,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -152,18 +152,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB0_1 @@ -179,19 +179,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB0_1 @@ -245,23 +245,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 @@ -294,19 +294,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -320,19 +320,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -346,20 +346,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -830,23 +830,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -862,12 +862,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -888,18 +888,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -915,18 +915,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -942,18 +942,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -969,19 +969,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1035,23 +1035,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 @@ -1066,11 +1066,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1091,19 +1091,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1117,19 +1117,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1143,19 +1143,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1169,20 +1169,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1223,25 +1223,24 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1255,23 +1254,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 @@ -1287,12 +1286,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1313,18 +1312,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1340,18 +1339,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1367,18 +1366,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB5_1 @@ -1394,19 +1393,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB5_1 @@ -1448,25 +1447,24 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1480,23 +1478,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -1512,12 +1510,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1538,18 +1536,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 @@ -1565,18 +1563,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 @@ -1592,18 +1590,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 @@ -1619,19 +1617,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 @@ -1673,25 +1671,24 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1705,23 +1702,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1737,12 +1734,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1763,18 +1760,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 @@ -1790,18 +1787,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 @@ -1817,18 +1814,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 @@ -1844,19 +1841,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 @@ -1883,24 +1880,24 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1925,25 +1922,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1958,26 +1955,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2003,21 +2000,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2034,21 +2031,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2065,21 +2062,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2096,22 +2093,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2133,24 +2130,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] -; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_mov_b32_e32 v8, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2174,25 +2170,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2205,26 +2200,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_mov_b32_e32 v9, v4 +; GFX10-NEXT: v_mov_b32_e32 v9, v5 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v7, v3 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v4, v7 -; GFX10-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 @@ -2248,22 +2243,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, v5 -; GFX908-NEXT: v_mov_b32_e32 v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2277,22 +2272,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, v5 -; GFX8-NEXT: v_mov_b32_e32 v9, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2306,22 +2301,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v10, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2335,23 +2330,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v5 -; GFX6-NEXT: v_mov_b32_e32 v9, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2903,24 +2898,24 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2945,25 +2940,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2978,26 +2973,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3014,18 +3009,18 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v10, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 @@ -3042,21 +3037,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 @@ -3073,21 +3068,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 @@ -3104,21 +3099,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 @@ -3135,22 +3130,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 @@ -3173,24 +3168,24 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3215,25 +3210,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3248,26 +3243,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 @@ -3293,21 +3288,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -3324,21 +3319,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -3355,21 +3350,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 @@ -3386,22 +3381,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 @@ -3429,41 +3424,41 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3477,42 +3472,42 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3520,24 +3515,25 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX942-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX942-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX942-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3547,90 +3543,93 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 +; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3638,21 +3637,22 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc @@ -3661,35 +3661,36 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3699,29 +3700,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v5, s4 -; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3733,30 +3735,31 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_and_b32_e32 v3, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3768,36 +3771,37 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3807,7 +3811,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3815,30 +3819,31 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3848,7 +3853,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3868,40 +3873,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3915,41 +3920,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3957,24 +3962,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 ; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX942-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX942-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX942-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3990,81 +3996,84 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 +; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -4072,57 +4081,59 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -4138,29 +4149,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v3, s4 -; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4171,30 +4183,31 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_and_b32_e32 v3, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4205,34 +4218,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4243,35 +4257,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5096,20 +5111,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -5119,31 +5134,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5155,27 +5170,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -5185,24 +5200,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5210,32 +5224,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, s6 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX942-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5245,28 +5260,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5276,56 +5291,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -5335,95 +5350,97 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5433,38 +5450,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5474,40 +5492,41 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5517,36 +5536,37 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5556,7 +5576,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5564,30 +5584,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5597,7 +5618,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5617,53 +5638,53 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5675,53 +5696,52 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5729,32 +5749,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX942-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5770,175 +5791,177 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5954,36 +5977,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5994,38 +6018,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6036,34 +6061,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6074,35 +6100,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7028,25 +7055,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7060,23 +7086,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -7101,18 +7127,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -7128,20 +7154,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -7164,30 +7190,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 @@ -7210,31 +7236,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 @@ -7277,25 +7303,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7308,23 +7332,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -7357,21 +7381,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7382,41 +7406,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 @@ -7428,42 +7452,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 @@ -8003,25 +8027,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8035,23 +8058,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -8067,12 +8090,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8093,18 +8116,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -8120,20 +8143,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 @@ -8156,30 +8179,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 @@ -8202,31 +8225,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 @@ -8269,25 +8292,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8300,23 +8321,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -8331,11 +8352,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8356,19 +8377,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8382,21 +8403,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8407,41 +8428,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 @@ -8453,42 +8474,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 @@ -8530,25 +8551,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8562,23 +8582,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -8594,12 +8614,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8620,18 +8640,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB24_1 @@ -8647,20 +8667,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB24_1 @@ -8683,30 +8703,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 @@ -8729,31 +8749,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 @@ -8796,25 +8816,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8827,23 +8845,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -8858,11 +8876,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8883,19 +8901,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8909,21 +8927,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8934,41 +8952,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 @@ -8980,42 +8998,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 @@ -9051,41 +9069,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX942-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX942-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 @@ -9096,48 +9114,48 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v5, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -9150,46 +9168,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -9202,41 +9219,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -9247,40 +9264,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 @@ -9291,41 +9308,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 @@ -9336,42 +9353,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 @@ -9383,38 +9400,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9425,39 +9442,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9489,40 +9506,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX942-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9532,45 +9549,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -9583,43 +9602,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -9633,39 +9654,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -9677,39 +9698,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9720,40 +9741,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9764,41 +9785,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9809,38 +9830,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9851,39 +9872,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9934,7 +9955,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -9946,40 +9967,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX942-NEXT: s_movk_i32 s10, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX942-NEXT: s_mov_b32 s11, 0x7060302 ; GFX942-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX942-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX942-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX942-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX942-NEXT: v_add3_u32 v7, v7, v6, s10 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX942-NEXT: v_perm_b32 v8, v6, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -9992,27 +10013,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB28_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -10026,42 +10047,43 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX11-TRUE16-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_add_f32 v4, v4, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v10, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -10075,14 +10097,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -10091,13 +10113,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -10111,42 +10133,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v10, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-FAKE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -10160,14 +10181,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -10175,15 +10196,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -10195,38 +10215,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v10, v7 +; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -10238,15 +10258,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -10255,13 +10275,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB28_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -10273,38 +10293,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -10316,27 +10336,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -10348,39 +10368,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX908-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX908-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -10392,27 +10412,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB28_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -10424,40 +10444,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX8-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX8-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -10469,21 +10489,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB28_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -10658,41 +10678,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX942-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX942-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB29_1 @@ -10703,48 +10723,48 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v5, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -10757,46 +10777,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10809,41 +10828,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1 @@ -10854,40 +10873,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 @@ -10898,41 +10917,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB29_1 @@ -10943,42 +10962,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB29_1 @@ -10990,38 +11009,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11032,39 +11051,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11096,40 +11115,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX942-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11139,45 +11158,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -11190,43 +11211,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -11240,39 +11263,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -11284,39 +11307,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11327,40 +11350,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11371,41 +11394,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11416,38 +11439,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11458,39 +11481,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11521,41 +11544,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX942-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX942-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 @@ -11566,48 +11589,48 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v5, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -11620,46 +11643,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -11672,41 +11694,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -11717,40 +11739,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 @@ -11761,41 +11783,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 @@ -11806,42 +11828,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 @@ -11853,38 +11875,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11895,39 +11917,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11959,40 +11981,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX942-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB32_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12002,45 +12024,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -12053,43 +12077,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -12103,39 +12129,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -12147,39 +12173,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12190,40 +12216,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12234,41 +12260,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12279,38 +12305,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12321,39 +12347,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12385,40 +12411,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX942-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX942-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12428,45 +12454,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -12479,43 +12507,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -12529,39 +12559,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -12573,39 +12603,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12616,40 +12646,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12660,41 +12690,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12705,38 +12735,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12747,39 +12777,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12838,23 +12868,23 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB34_1 @@ -12870,12 +12900,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -12898,18 +12928,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 @@ -12925,18 +12955,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB34_1 @@ -12952,18 +12982,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 @@ -12979,19 +13009,19 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index cad4c39eaf39f..d18201d0f37c1 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -34,19 +34,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -85,19 +85,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -113,25 +113,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -148,19 +148,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -211,24 +211,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -261,23 +261,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -288,24 +288,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -320,20 +320,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -402,7 +402,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -414,22 +414,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX942-NEXT: v_max_f32_e32 v6, v4, v9 +; GFX942-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX942-NEXT: v_max_f32_e32 v8, v6, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -443,21 +443,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB2_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -522,7 +522,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -534,22 +534,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9 +; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX90A-NEXT: v_max_f32_e32 v8, v6, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -561,27 +561,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -593,23 +593,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX908-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_max_f32_e32 v6, v8, v8 +; GFX908-NEXT: v_max_f32_e32 v7, v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -621,21 +621,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -774,19 +774,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -803,27 +803,28 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -835,27 +836,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -866,19 +867,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -894,25 +895,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -929,19 +930,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -958,19 +959,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -987,20 +988,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1032,19 +1033,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -1083,19 +1084,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1111,25 +1112,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 @@ -1146,19 +1147,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 @@ -1201,29 +1202,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1246,30 +1246,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1301,29 +1301,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1334,29 +1334,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1396,26 +1396,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1439,27 +1439,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1492,27 +1493,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1523,27 +1524,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1605,17 +1606,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[2:3], v[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1706,17 +1707,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1834,15 +1835,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1900,15 +1901,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -2008,29 +2009,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2053,30 +2053,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2088,31 +2088,31 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2123,26 +2123,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v10, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 @@ -2153,29 +2153,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2186,29 +2186,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2219,29 +2219,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2252,30 +2252,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 -; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2296,29 +2296,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2341,30 +2340,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2396,29 +2395,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 @@ -2429,29 +2428,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 @@ -2495,46 +2494,46 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.l, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,46 +2545,46 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2593,300 +2592,309 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX942-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX942-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 +; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2896,7 +2904,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2904,30 +2912,31 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2937,7 +2946,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2955,45 +2964,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.l, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3005,45 +3014,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3051,31 +3060,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX942-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX942-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3086,154 +3096,159 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 +; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3244,31 +3259,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3279,32 +3295,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3315,34 +3332,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3353,35 +3371,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3401,15 +3420,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3423,29 +3442,28 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v6.l, v6.l, v6.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v6.l, v6.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3460,14 +3478,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3475,7 +3493,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3486,15 +3504,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3508,30 +3526,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v8, v5, v5 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v6, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v6, v8 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v10 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3546,14 +3563,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -3561,7 +3578,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3569,12 +3586,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v11, v6 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -3586,24 +3603,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX942-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3617,36 +3634,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v8 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB12_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3658,29 +3675,28 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v6.l, v6.l, v6.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v6.l, v6.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3694,14 +3710,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3710,22 +3726,22 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3737,30 +3753,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v8, v5, v5 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v6, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3774,14 +3789,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3790,20 +3805,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX10-NEXT: v_not_b32_e32 v9, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v11, v7 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3813,26 +3828,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: v_max_f16_e32 v8, v5, v5 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3844,15 +3859,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3861,19 +3876,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 -; GFX90A-NEXT: v_not_b32_e32 v10, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v11, v6 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3885,24 +3900,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3914,33 +3929,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX908-NEXT: v_not_b32_e32 v9, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v11, v6 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3952,25 +3967,25 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX908-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3982,33 +3997,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX8-NEXT: v_not_b32_e32 v9, v4 +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v11, v6 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -4020,26 +4035,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX8-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -4051,21 +4066,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4228,20 +4243,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -4251,31 +4266,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4287,27 +4302,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4317,24 +4332,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4342,32 +4356,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, s6 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX942-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4377,28 +4392,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4408,56 +4423,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4467,95 +4482,97 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4565,38 +4582,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4606,40 +4624,41 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4649,37 +4668,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4689,7 +4709,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4697,31 +4717,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4731,7 +4752,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4751,53 +4772,53 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4809,53 +4830,52 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4863,32 +4883,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX942-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4904,175 +4925,177 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5088,36 +5111,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5128,38 +5152,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5170,35 +5195,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5209,36 +5235,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6145,28 +6172,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6179,21 +6204,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_pk_max_f16 v4, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6209,28 +6233,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v1, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6242,27 +6265,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v1, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -6273,19 +6296,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6301,25 +6324,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 @@ -6330,29 +6353,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -6375,30 +6398,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 @@ -6421,31 +6444,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 @@ -6467,25 +6490,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_max_num_f16 v1, v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6499,25 +6521,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6528,25 +6549,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6559,25 +6580,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v1, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -6589,23 +6610,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6616,24 +6637,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v1, v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6644,28 +6665,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v5, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6676,41 +6697,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 @@ -6722,42 +6743,42 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 @@ -6778,7 +6799,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6793,25 +6814,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 +; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_pk_max_num_f16 v7, v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6826,14 +6847,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6841,14 +6862,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6860,23 +6881,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX942-NEXT: v_pk_max_f16 v6, v9, v9 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_pk_max_f16 v6, v4, v9 +; GFX942-NEXT: v_pk_max_f16 v8, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6889,27 +6910,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB18_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6923,25 +6944,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v6, v8, v8 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_pk_max_f16 v7, v6, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6955,14 +6976,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6971,13 +6992,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6989,24 +7010,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX10-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v6, v8, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_pk_max_f16 v7, v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -7018,15 +7039,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -7035,13 +7056,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7053,22 +7074,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 -; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 +; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX90A-NEXT: v_pk_max_f16 v8, v6, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7080,27 +7101,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7112,23 +7133,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX908-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 -; GFX908-NEXT: v_pk_max_f16 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX908-NEXT: v_pk_max_f16 v7, v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7140,27 +7161,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7172,27 +7193,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v7, v8, v8 +; GFX8-NEXT: v_max_f16_e32 v6, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7204,21 +7225,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7395,48 +7416,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-TRUE16-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v1, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v5, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7453,46 +7472,44 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-FAKE16-NEXT: s_add_co_i32 s6, s16, 0x400 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7505,41 +7522,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX942-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 @@ -7550,48 +7567,48 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v5, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7604,46 +7621,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7656,41 +7672,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -7701,40 +7717,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 @@ -7745,41 +7761,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -7790,42 +7806,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -7837,38 +7853,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7879,39 +7895,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7931,43 +7947,46 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX12-TRUE16-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_max_num_f32 v0, v0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v3, v1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX12-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7984,42 +8003,44 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX12-FAKE16-NEXT: s_add_co_i32 s6, s16, 0x400 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v3, v1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8033,40 +8054,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX942-NEXT: v_max_f32_e32 v2, v5, v4 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX942-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8076,45 +8097,47 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_max_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8127,43 +8150,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8177,39 +8202,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -8221,39 +8246,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8264,40 +8289,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8308,41 +8333,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8354,37 +8379,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8396,38 +8421,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8447,7 +8472,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8462,44 +8487,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-TRUE16-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v8 :: v_dual_max_num_f32 v4, v4, v9 -; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v10, v7 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8514,14 +8540,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8529,7 +8555,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8540,7 +8566,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8555,43 +8581,44 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-FAKE16-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8 -; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v6, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v10, v7 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX12-FAKE16-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8606,14 +8633,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8621,14 +8648,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8640,40 +8667,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX942-NEXT: s_movk_i32 s10, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX942-NEXT: s_mov_b32 s11, 0x7060302 ; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX942-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v10 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX942-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX942-NEXT: v_add3_u32 v7, v7, v6, s10 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX942-NEXT: v_perm_b32 v8, v6, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8686,27 +8713,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB21_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8720,42 +8747,43 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX11-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v8 :: v_dual_max_f32 v4, v4, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v10, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8769,14 +8797,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -8785,13 +8813,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8805,42 +8833,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v10, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8854,14 +8881,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -8869,15 +8896,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8889,38 +8915,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v10, v7 +; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -8932,15 +8958,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -8949,13 +8975,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -8967,38 +8993,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9010,27 +9036,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9042,39 +9068,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX908-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9086,27 +9112,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9118,40 +9144,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX8-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX8-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9163,21 +9189,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9357,19 +9383,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX942-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -9408,19 +9434,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -9438,25 +9464,25 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -9473,19 +9499,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 6275afd2c6994..dd65ab3d00776 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -34,19 +34,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -85,19 +85,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -113,25 +113,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -148,19 +148,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -211,24 +211,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -261,23 +261,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -288,24 +288,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -320,20 +320,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -402,7 +402,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -414,22 +414,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX942-NEXT: v_min_f32_e32 v6, v4, v9 +; GFX942-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX942-NEXT: v_min_f32_e32 v8, v6, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -443,21 +443,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB2_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -522,7 +522,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -534,22 +534,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9 +; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX90A-NEXT: v_min_f32_e32 v8, v6, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -561,27 +561,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -593,23 +593,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX908-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_max_f32_e32 v6, v8, v8 +; GFX908-NEXT: v_min_f32_e32 v7, v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -621,21 +621,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -774,19 +774,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -803,27 +803,28 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -835,27 +836,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -866,19 +867,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -894,25 +895,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -929,19 +930,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -958,19 +959,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -987,20 +988,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1032,19 +1033,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -1083,19 +1084,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1111,25 +1112,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 @@ -1146,19 +1147,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 @@ -1201,29 +1202,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1246,30 +1246,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1301,29 +1301,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1334,29 +1334,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1396,26 +1396,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[6:7], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1439,27 +1439,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1492,27 +1493,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1523,27 +1524,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1605,17 +1606,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[2:3], v[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1706,17 +1707,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1834,15 +1835,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1900,15 +1901,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -2008,29 +2009,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2053,30 +2053,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2088,31 +2088,31 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2123,26 +2123,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v10, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 @@ -2153,29 +2153,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2186,29 +2186,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2219,29 +2219,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2252,30 +2252,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 -; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2296,29 +2296,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2341,30 +2340,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2396,29 +2395,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 @@ -2429,29 +2428,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 @@ -2495,46 +2494,46 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.l, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v1.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,46 +2545,46 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2593,300 +2592,309 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX942-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX942-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 +; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2896,7 +2904,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2904,30 +2912,31 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2937,7 +2946,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2955,45 +2964,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.l, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v1.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3005,45 +3014,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3051,31 +3060,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX942-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX942-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX942-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX942-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX942-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX942-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3086,154 +3096,159 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 +; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3244,31 +3259,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3279,32 +3295,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3315,34 +3332,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3353,35 +3371,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3401,15 +3420,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3423,29 +3442,28 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v6.l, v6.l, v6.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v6.l, v6.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3460,14 +3478,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3475,7 +3493,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3486,15 +3504,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3508,30 +3526,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v8, v5, v5 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v6, v6 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v6, v6, v8 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, v4, v10 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3546,14 +3563,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -3561,7 +3578,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3569,12 +3586,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX942-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX942-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX942-NEXT: v_not_b32_e32 v10, v4 +; GFX942-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX942-NEXT: v_not_b32_e32 v11, v6 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -3586,24 +3603,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX942-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX942-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX942-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3617,36 +3634,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v8 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB12_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3658,29 +3675,28 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v6.l, v6.l, v6.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v6.l, v6.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3694,14 +3710,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3710,22 +3726,22 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3737,30 +3753,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v8, v5, v5 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v6, v6, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3774,14 +3789,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3790,20 +3805,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX10-NEXT: v_not_b32_e32 v9, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v11, v7 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3813,26 +3828,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: v_max_f16_e32 v8, v5, v5 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX10-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3844,15 +3859,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3861,19 +3876,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 -; GFX90A-NEXT: v_not_b32_e32 v10, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v11, v6 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3885,24 +3900,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3914,33 +3929,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX908-NEXT: v_not_b32_e32 v9, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v11, v6 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3952,25 +3967,25 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX908-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3982,33 +3997,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX8-NEXT: v_not_b32_e32 v9, v4 +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v11, v6 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -4020,26 +4035,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX8-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -4051,21 +4066,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -4228,20 +4243,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -4251,31 +4266,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4287,27 +4302,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4317,24 +4332,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4342,32 +4356,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 -; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v5, s6 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX942-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4377,28 +4392,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4408,56 +4423,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4467,95 +4482,97 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4565,38 +4582,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4606,40 +4624,41 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4649,37 +4668,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4689,7 +4709,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4697,31 +4717,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4731,7 +4752,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4751,53 +4772,53 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4809,53 +4830,52 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_and_b32 s5, s16, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4863,32 +4883,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_addk_i32 s16, 0x200 -; GFX942-NEXT: s_and_b32 s4, s16, -4 -; GFX942-NEXT: v_mov_b32_e32 v2, s4 -; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX942-NEXT: s_and_b32 s6, s16, -4 +; GFX942-NEXT: v_mov_b32_e32 v1, s6 +; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX942-NEXT: s_and_b32 s4, s16, 3 -; GFX942-NEXT: s_lshl_b32 s6, s4, 3 -; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX942-NEXT: s_not_b32 s7, s4 +; GFX942-NEXT: s_lshl_b32 s7, s4, 3 +; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX942-NEXT: s_not_b32 s8, s4 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX942-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4904,175 +4925,177 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_not_b32 s7, s6 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s16, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_not_b32 s7, s6 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_mov_b32 s6, 0 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5088,36 +5111,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5128,38 +5152,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5170,35 +5195,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5209,36 +5235,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6145,28 +6172,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6179,21 +6204,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_pk_min_f16 v4, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6209,28 +6233,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v1, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6242,27 +6265,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v1, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -6273,19 +6296,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6301,25 +6324,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 @@ -6330,29 +6353,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -6375,30 +6398,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 @@ -6421,31 +6444,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 @@ -6467,25 +6490,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_min_num_f16 v1, v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6499,25 +6521,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6528,25 +6549,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_min_f16 v1, v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6559,25 +6580,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_min_f16 v1, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -6589,23 +6610,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6616,24 +6637,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v1, v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6644,28 +6665,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, v5, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6676,41 +6697,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 @@ -6722,42 +6743,42 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 @@ -6778,7 +6799,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6793,25 +6814,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 +; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_pk_min_num_f16 v7, v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6826,14 +6847,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6841,14 +6862,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6860,23 +6881,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX942-NEXT: v_pk_max_f16 v6, v9, v9 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_pk_min_f16 v6, v4, v9 +; GFX942-NEXT: v_pk_min_f16 v8, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -6889,27 +6910,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB18_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6923,25 +6944,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v6, v8, v8 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_pk_min_f16 v7, v6, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6955,14 +6976,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6971,13 +6992,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6989,24 +7010,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX10-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v6, v8, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v5, v4, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_pk_min_f16 v7, v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -7018,15 +7039,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -7035,13 +7056,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7053,22 +7074,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 -; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 +; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX90A-NEXT: v_pk_min_f16 v8, v6, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7080,27 +7101,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7112,23 +7133,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX908-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 -; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX908-NEXT: v_pk_min_f16 v7, v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7140,27 +7161,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7172,27 +7193,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v7, v8, v8 +; GFX8-NEXT: v_min_f16_e32 v6, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7204,21 +7225,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -7395,48 +7416,46 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-TRUE16-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v1, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v5, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7453,46 +7472,44 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-FAKE16-NEXT: s_add_co_i32 s6, s16, 0x400 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7505,41 +7522,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX942-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX942-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v1, v4, v1 +; GFX942-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX942-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB19_1 @@ -7550,48 +7567,48 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v5, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7604,46 +7621,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7656,41 +7672,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_min_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -7701,40 +7717,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 @@ -7745,41 +7761,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -7790,42 +7806,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -7837,38 +7853,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7879,39 +7895,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7931,43 +7947,46 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX12-TRUE16-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_min_num_f32 v0, v0, v3 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v3, v1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX12-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7984,42 +8003,44 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX12-FAKE16-NEXT: s_add_co_i32 s6, s16, 0x400 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v3, v1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8033,40 +8054,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 -; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 +; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX942-NEXT: s_add_i32 s8, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX942-NEXT: s_movk_i32 s8, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: s_movk_i32 s9, 0x7fff +; GFX942-NEXT: s_mov_b32 s10, 0x7060302 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX942-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX942-NEXT: v_min_f32_e32 v2, v5, v4 +; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, s8 +; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX942-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8076,45 +8097,47 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_min_f32 v0, v0, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8127,43 +8150,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8177,39 +8202,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -8221,39 +8246,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_min_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8264,40 +8289,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8308,41 +8333,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8354,37 +8379,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8396,38 +8421,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8447,7 +8472,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8462,44 +8487,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-TRUE16-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v8 :: v_dual_min_num_f32 v4, v4, v9 -; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v10, v7 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8514,14 +8540,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8529,7 +8555,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8540,7 +8566,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8555,43 +8581,44 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-FAKE16-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 -; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v6, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v10, v7 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX12-FAKE16-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8606,14 +8633,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8621,14 +8648,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX942-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8640,40 +8667,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: ; implicit-def: $vgpr4 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX942-NEXT: s_movk_i32 s10, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX942-NEXT: s_mov_b32 s11, 0x7060302 ; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s10 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v10 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX942-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX942-NEXT: v_add3_u32 v7, v7, v6, s10 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX942-NEXT: v_perm_b32 v8, v6, v4, s11 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8686,27 +8713,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB21_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8720,42 +8747,43 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX11-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v8 :: v_dual_min_f32 v4, v4, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v10, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8769,14 +8797,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -8785,13 +8813,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8805,42 +8833,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v10, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8854,14 +8881,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -8869,15 +8896,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8889,38 +8915,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v10, v7 +; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -8932,15 +8958,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -8949,13 +8975,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -8967,38 +8993,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9010,27 +9036,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9042,39 +9068,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX908-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX908-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9086,27 +9112,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9118,40 +9144,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX8-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX8-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9163,21 +9189,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9357,19 +9383,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX942-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, s6 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -9408,19 +9434,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -9438,25 +9464,25 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -9473,19 +9499,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll b/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll index d07cc84865bea..94085ad99ccda 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll +++ b/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll @@ -18,11 +18,9 @@ define amdgpu_kernel void @hoge(i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK-NEXT: s_bitcmp1_b32 s2, 24 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: .LBB0_1: ; %bb25 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb30 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 31c23b94a8de8..53e9306758509 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -361,14 +361,14 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split ; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base ; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 -; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; GISEL-ASM-NEXT: .LBB11_3: ; %finally ; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] -; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[4:5] +; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) ; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll index 2558da401f89a..5ee1c94cb1278 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -8,11 +8,9 @@ define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1010-LABEL: combine_add_zext_xor: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB0_2 ; GFX1010-NEXT: .LBB0_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -24,7 +22,7 @@ define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1010-NEXT: s_cbranch_vccz .LBB0_4 ; GFX1010-NEXT: .LBB0_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1010-NEXT: ; implicit-def: $sgpr5 ; GFX1010-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1010-NEXT: ; %bb.3: ; %bb @@ -39,12 +37,9 @@ define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1100-LABEL: combine_add_zext_xor: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB0_2 ; GFX1100-NEXT: .LBB0_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -58,8 +53,7 @@ define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1100-NEXT: s_cbranch_vccz .LBB0_4 ; GFX1100-NEXT: .LBB0_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr1 ; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1100-NEXT: ; %bb.3: ; %bb @@ -101,11 +95,9 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1010-LABEL: combine_sub_zext_xor: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB1_2 ; GFX1010-NEXT: .LBB1_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -117,7 +109,7 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1010-NEXT: s_cbranch_vccz .LBB1_4 ; GFX1010-NEXT: .LBB1_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1010-NEXT: ; implicit-def: $sgpr5 ; GFX1010-NEXT: s_cbranch_vccnz .LBB1_1 ; GFX1010-NEXT: ; %bb.3: ; %bb @@ -132,12 +124,9 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1100-LABEL: combine_sub_zext_xor: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB1_2 ; GFX1100-NEXT: .LBB1_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -151,8 +140,7 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1100-NEXT: s_cbranch_vccz .LBB1_4 ; GFX1100-NEXT: .LBB1_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr1 ; GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 ; GFX1100-NEXT: ; %bb.3: ; %bb @@ -195,26 +183,24 @@ define i32 @combine_add_zext_or(i32 inreg %cond) { ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 -; GFX1010-NEXT: s_mov_b32 s5, 0 -; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 +; GFX1010-NEXT: s_mov_b32 s4, 0 +; GFX1010-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1010-NEXT: s_branch .LBB2_2 ; GFX1010-NEXT: .LBB2_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX1010-NEXT: s_cmpk_gt_i32 s5, 0xfbe6 +; GFX1010-NEXT: s_cmpk_gt_i32 s4, 0xfbe6 ; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1010-NEXT: s_add_i32 s5, s5, 1 +; GFX1010-NEXT: s_add_i32 s4, s4, 1 ; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s7 ; GFX1010-NEXT: s_cbranch_vccz .LBB2_4 ; GFX1010-NEXT: .LBB2_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s5 ; GFX1010-NEXT: ; implicit-def: $sgpr6 ; GFX1010-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX1010-NEXT: ; %bb.3: ; %bb ; GFX1010-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX1010-NEXT: v_mov_b32_e32 v0, s5 +; GFX1010-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-NEXT: buffer_load_dword v0, v0, s[4:7], 64 offen glc ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 @@ -228,28 +214,24 @@ define i32 @combine_add_zext_or(i32 inreg %cond) { ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1100-NEXT: s_mov_b32 s1, 0 -; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 +; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1100-NEXT: s_branch .LBB2_2 ; GFX1100-NEXT: .LBB2_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX1100-NEXT: s_cmpk_gt_i32 s1, 0xfbe6 +; GFX1100-NEXT: s_cmpk_gt_i32 s0, 0xfbe6 ; GFX1100-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1100-NEXT: s_add_i32 s1, s1, 1 +; GFX1100-NEXT: s_add_i32 s0, s0, 1 ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s3 ; GFX1100-NEXT: s_cbranch_vccz .LBB2_4 ; GFX1100-NEXT: .LBB2_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 ; GFX1100-NEXT: ; implicit-def: $sgpr2 ; GFX1100-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX1100-NEXT: ; %bb.3: ; %bb ; GFX1100-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX1100-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-NEXT: v_mov_b32_e32 v0, s0 ; GFX1100-NEXT: buffer_load_b32 v0, v0, s[0:3], 64 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: v_cmp_eq_u32_e64 s2, 0, v0 @@ -292,26 +274,24 @@ define i32 @combine_sub_zext_or(i32 inreg %cond) { ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 -; GFX1010-NEXT: s_mov_b32 s5, 0 -; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 +; GFX1010-NEXT: s_mov_b32 s4, 0 +; GFX1010-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1010-NEXT: s_branch .LBB3_2 ; GFX1010-NEXT: .LBB3_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX1010-NEXT: s_cmpk_gt_i32 s5, 0xfbe6 +; GFX1010-NEXT: s_cmpk_gt_i32 s4, 0xfbe6 ; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1010-NEXT: s_add_i32 s5, s5, -1 +; GFX1010-NEXT: s_add_i32 s4, s4, -1 ; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s7 ; GFX1010-NEXT: s_cbranch_vccz .LBB3_4 ; GFX1010-NEXT: .LBB3_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s5 ; GFX1010-NEXT: ; implicit-def: $sgpr6 ; GFX1010-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX1010-NEXT: ; %bb.3: ; %bb ; GFX1010-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX1010-NEXT: v_mov_b32_e32 v0, s5 +; GFX1010-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-NEXT: buffer_load_dword v0, v0, s[4:7], 64 offen glc ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 @@ -325,28 +305,24 @@ define i32 @combine_sub_zext_or(i32 inreg %cond) { ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1100-NEXT: s_mov_b32 s1, 0 -; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 +; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1100-NEXT: s_branch .LBB3_2 ; GFX1100-NEXT: .LBB3_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX1100-NEXT: s_cmpk_gt_i32 s1, 0xfbe6 +; GFX1100-NEXT: s_cmpk_gt_i32 s0, 0xfbe6 ; GFX1100-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1100-NEXT: s_add_i32 s1, s1, -1 +; GFX1100-NEXT: s_add_i32 s0, s0, -1 ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s3 ; GFX1100-NEXT: s_cbranch_vccz .LBB3_4 ; GFX1100-NEXT: .LBB3_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 ; GFX1100-NEXT: ; implicit-def: $sgpr2 ; GFX1100-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX1100-NEXT: ; %bb.3: ; %bb ; GFX1100-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX1100-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-NEXT: v_mov_b32_e32 v0, s0 ; GFX1100-NEXT: buffer_load_b32 v0, v0, s[0:3], 64 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: v_cmp_eq_u32_e64 s2, 0, v0 @@ -388,11 +364,9 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1010-LABEL: combine_add_zext_and: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB4_2 ; GFX1010-NEXT: .LBB4_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB4_2 Depth=1 @@ -403,7 +377,7 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1010-NEXT: s_cbranch_vccz .LBB4_4 ; GFX1010-NEXT: .LBB4_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1010-NEXT: ; implicit-def: $sgpr5 ; GFX1010-NEXT: s_cbranch_vccnz .LBB4_1 ; GFX1010-NEXT: ; %bb.3: ; %bb @@ -418,12 +392,9 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1100-LABEL: combine_add_zext_and: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB4_2 ; GFX1100-NEXT: .LBB4_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1 @@ -436,8 +407,7 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1100-NEXT: s_cbranch_vccz .LBB4_4 ; GFX1100-NEXT: .LBB4_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr1 ; GFX1100-NEXT: s_cbranch_vccnz .LBB4_1 ; GFX1100-NEXT: ; %bb.3: ; %bb @@ -480,11 +450,9 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1010-LABEL: combine_sub_zext_and: ; GFX1010: ; %bb.0: ; %.entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s16, 0 ; GFX1010-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX1010-NEXT: v_cmp_ne_u32_e64 s4, 1, v0 ; GFX1010-NEXT: s_branch .LBB5_2 ; GFX1010-NEXT: .LBB5_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB5_2 Depth=1 @@ -495,7 +463,7 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1010-NEXT: s_cbranch_vccz .LBB5_4 ; GFX1010-NEXT: .LBB5_2: ; %.a ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1010-NEXT: ; implicit-def: $sgpr5 ; GFX1010-NEXT: s_cbranch_vccnz .LBB5_1 ; GFX1010-NEXT: ; %bb.3: ; %bb @@ -510,12 +478,9 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1100-LABEL: combine_sub_zext_and: ; GFX1100: ; %bb.0: ; %.entry ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB5_2 ; GFX1100-NEXT: .LBB5_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1 @@ -528,8 +493,7 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1100-NEXT: s_cbranch_vccz .LBB5_4 ; GFX1100-NEXT: .LBB5_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr1 ; GFX1100-NEXT: s_cbranch_vccnz .LBB5_1 ; GFX1100-NEXT: ; %bb.3: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 72913d2596ebf..26999528a713f 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -8,30 +8,26 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4 +; CHECK-NEXT: s_load_dword s14, s[8:9], 0x4 ; CHECK-NEXT: s_add_u32 s24, s24, s17 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 -; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s2, 8 -; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s2, 16 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s0, 24 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_xor_b64 s[2:3], s[6:7], -1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s6, 8 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] +; CHECK-NEXT: s_bitcmp1_b32 s14, 8 ; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 -; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: s_and_b64 s[0:1], exec, s[2:3] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_and_b64 s[2:3], exec, s[8:9] ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[18:19], 0 @@ -44,17 +40,17 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_12 ; CHECK-NEXT: .LBB0_3: ; %bb7 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[10:11] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.4: ; %bb8 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[4:5] +; CHECK-NEXT: s_mov_b64 vcc, s[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 ; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] +; CHECK-NEXT: s_mov_b64 s[22:23], s[8:9] ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_6: ; in Loop: Header=BB0_3 Depth=1 @@ -76,7 +72,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[16:17], 0 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 -; CHECK-NEXT: s_mov_b64 vcc, s[6:7] +; CHECK-NEXT: s_mov_b64 vcc, s[2:3] ; CHECK-NEXT: s_cbranch_vccz .LBB0_11 ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -94,7 +90,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_cbranch_vccz .LBB0_16 ; CHECK-NEXT: ; %bb.13: ; %bb14 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_15 ; CHECK-NEXT: ; %bb.14: ; %bb15 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -114,10 +110,10 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_23 ; CHECK-NEXT: ; %bb.19: ; %bb17 -; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_21 ; CHECK-NEXT: ; %bb.20: ; %bb19 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB0_22 ; CHECK-NEXT: .LBB0_21: ; %bb18 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll index d03d53a8cbbaa..1e408f6182b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll @@ -73,42 +73,40 @@ define amdgpu_cs void @issue130119(i1 %arg) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: s_mov_b32 s16, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_branch .LBB1_2 ; CHECK-NEXT: .LBB1_1: ; %Flow2 ; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[4:5], exec, s[6:7] +; CHECK-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB1_10 ; CHECK-NEXT: .LBB1_2: ; %bb1 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB1_4 Depth 2 -; CHECK-NEXT: s_and_b32 s2, s16, 1 -; CHECK-NEXT: s_cmp_eq_u32 s2, 0 +; CHECK-NEXT: s_and_b32 s4, s16, 1 +; CHECK-NEXT: s_cmp_eq_u32 s4, 0 ; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_cmp_eq_u32 s2, 1 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 +; CHECK-NEXT: s_cmp_eq_u32 s4, 1 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 -; CHECK-NEXT: ; implicit-def: $sgpr8_sgpr9 +; CHECK-NEXT: ; implicit-def: $sgpr4_sgpr5 ; CHECK-NEXT: s_branch .LBB1_4 ; CHECK-NEXT: .LBB1_3: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; CHECK-NEXT: s_xor_b64 s[14:15], s[14:15], -1 ; CHECK-NEXT: s_and_b64 s[12:13], exec, s[12:13] ; CHECK-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; CHECK-NEXT: s_andn2_b64 s[8:9], s[8:9], exec +; CHECK-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; CHECK-NEXT: s_and_b64 s[12:13], s[14:15], exec -; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_cbranch_execz .LBB1_8 ; CHECK-NEXT: .LBB1_4: ; %bb3 ; CHECK-NEXT: ; Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[14:15], s[6:7] ; CHECK-NEXT: s_cbranch_vccnz .LBB1_6 ; CHECK-NEXT: ; %bb.5: ; %bb7 @@ -128,14 +126,14 @@ define amdgpu_cs void @issue130119(i1 %arg) { ; CHECK-NEXT: .LBB1_8: ; %loop.exit.guard ; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] -; CHECK-NEXT: s_mov_b64 s[2:3], -1 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_mov_b64 s[6:7], -1 +; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[8:9] ; CHECK-NEXT: s_cbranch_execz .LBB1_1 ; CHECK-NEXT: ; %bb.9: ; %bb10 ; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: s_or_b32 s16, s16, 1 -; CHECK-NEXT: s_xor_b64 s[2:3], exec, -1 +; CHECK-NEXT: s_xor_b64 s[6:7], exec, -1 ; CHECK-NEXT: s_branch .LBB1_1 ; CHECK-NEXT: .LBB1_10: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index ae5da3ad094c7..9399667427e16 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -18657,42 +18657,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18707,41 +18706,39 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18756,34 +18753,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB68_1 @@ -18797,33 +18794,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB68_1 @@ -18837,33 +18834,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB68_1 @@ -18877,34 +18874,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB68_1 @@ -18983,42 +18980,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19033,41 +19029,39 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19083,35 +19077,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB69_1 @@ -19124,33 +19118,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 @@ -19164,33 +19158,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB69_1 @@ -19206,34 +19200,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB69_1 @@ -19317,8 +19311,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -19326,33 +19318,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, v5, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19368,8 +19361,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -19377,33 +19368,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19418,35 +19409,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB70_1 @@ -19464,33 +19455,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX90A-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 @@ -19508,33 +19499,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB70_1 @@ -19549,34 +19540,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB70_1 @@ -19654,42 +19645,43 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19702,41 +19694,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19749,35 +19741,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB71_1 @@ -19788,36 +19780,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB71_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19827,36 +19819,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB71_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19866,37 +19858,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB71_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19970,42 +19962,43 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -20018,41 +20011,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20067,35 +20060,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB72_1 @@ -20106,36 +20099,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20145,36 +20138,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB72_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20186,37 +20179,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB72_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20299,42 +20292,43 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -20350,41 +20344,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20399,35 +20393,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB73_1 @@ -20445,28 +20439,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -20489,28 +20483,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -20528,37 +20522,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB73_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20637,42 +20631,41 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -20687,41 +20680,39 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20737,35 +20728,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB74_1 @@ -20778,35 +20769,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB74_1 @@ -20820,33 +20811,33 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB74_1 @@ -20862,34 +20853,34 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB74_1 @@ -20968,42 +20959,43 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -21016,41 +21008,41 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21065,35 +21057,35 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB75_1 @@ -21104,38 +21096,38 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB75_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21145,36 +21137,36 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB75_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21186,37 +21178,37 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB75_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21294,42 +21286,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -21344,41 +21335,39 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21393,34 +21382,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB76_1 @@ -21434,33 +21423,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 @@ -21474,33 +21463,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB76_1 @@ -21514,34 +21503,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB76_1 @@ -21619,42 +21608,43 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -21667,41 +21657,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21714,35 +21704,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB77_1 @@ -21753,36 +21743,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB77_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21792,36 +21782,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB77_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21831,37 +21821,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB77_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21936,42 +21926,41 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -21986,41 +21975,39 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22035,34 +22022,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB78_1 @@ -22076,33 +22063,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 @@ -22116,33 +22103,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB78_1 @@ -22156,34 +22143,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB78_1 @@ -22261,42 +22248,43 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -22309,41 +22297,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22356,35 +22344,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB79_1 @@ -22395,36 +22383,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22434,36 +22422,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB79_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22473,37 +22461,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB79_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 6218a5c82afcd..cb3813a4f57f0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -34,13 +34,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -79,13 +79,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -103,13 +103,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -177,13 +177,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -224,13 +224,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -248,13 +248,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -334,18 +334,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 @@ -388,17 +388,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 @@ -416,17 +416,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -491,21 +491,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -537,20 +537,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -560,20 +560,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -632,21 +632,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -680,20 +680,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -703,20 +703,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -784,23 +784,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -844,12 +844,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -872,12 +872,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -950,13 +950,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -997,13 +997,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1023,13 +1023,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1100,21 +1100,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1148,22 +1148,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1173,20 +1173,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1252,13 +1252,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1276,15 +1276,14 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1304,14 +1303,14 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1331,13 +1330,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1355,13 +1354,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1445,13 +1444,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1490,13 +1489,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1514,13 +1513,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1592,13 +1591,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1637,13 +1636,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1661,13 +1660,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1735,13 +1734,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1782,13 +1781,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1806,13 +1805,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1892,18 +1891,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX942-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 @@ -1946,17 +1945,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 @@ -1974,17 +1973,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -2049,21 +2048,21 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2095,20 +2094,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2118,20 +2117,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2190,21 +2189,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2238,20 +2237,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2261,20 +2260,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2342,23 +2341,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2402,12 +2401,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2430,12 +2429,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2508,13 +2507,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2555,13 +2554,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2581,13 +2580,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2658,21 +2657,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2706,22 +2705,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2731,20 +2730,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2802,29 +2801,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2833,25 +2832,26 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2898,29 +2898,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2928,22 +2928,23 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3033,90 +3034,95 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB18_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_6 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB18_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB18_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB18_2 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3124,17 +3130,18 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3192,7 +3199,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo @@ -3219,9 +3225,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3240,6 +3247,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3295,7 +3303,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo @@ -3318,9 +3325,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3339,6 +3347,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3436,7 +3445,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3459,8 +3467,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3476,6 +3485,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3491,7 +3501,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3519,8 +3528,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3536,6 +3546,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3604,7 +3615,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo @@ -3631,9 +3641,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3652,6 +3663,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3708,7 +3720,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo @@ -3731,9 +3742,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3752,6 +3764,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3849,7 +3862,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3872,8 +3884,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3889,6 +3902,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3904,7 +3918,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3932,8 +3945,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3949,6 +3963,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -4017,7 +4032,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4034,20 +4048,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4056,19 +4071,20 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4113,7 +4129,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4126,21 +4141,22 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4148,18 +4164,19 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4245,7 +4262,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4258,38 +4274,40 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4297,7 +4315,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4312,42 +4329,44 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[2:3] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4403,14 +4422,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4423,20 +4441,21 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4444,20 +4463,21 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4504,13 +4524,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4520,40 +4539,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4643,11 +4664,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4658,38 +4678,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4697,13 +4719,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4714,42 +4735,44 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4808,14 +4831,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4828,20 +4850,21 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4849,20 +4872,21 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4910,13 +4934,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4926,40 +4949,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5049,11 +5074,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5064,38 +5088,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5103,13 +5129,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5120,42 +5145,44 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5214,29 +5241,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5245,25 +5272,26 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5310,29 +5338,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5340,76 +5368,82 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execz .LBB24_4 -; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_cbranch_execnz .LBB24_3 +; GFX10-NEXT: ; %bb.1: ; %Flow2 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB24_6 +; GFX10-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB24_3: ; %atomicrmw.global +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX10-NEXT: .LBB24_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB24_2 -; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_cbranch_execnz .LBB24_4 +; GFX10-NEXT: ; %bb.5: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX10-NEXT: .LBB24_4: ; %Flow2 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 -; GFX10-NEXT: s_cbranch_execz .LBB24_6 -; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB24_2 +; GFX10-NEXT: .LBB24_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5419,7 +5453,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5437,8 +5470,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -5457,6 +5491,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] @@ -5469,90 +5504,95 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB24_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB24_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB24_6 +; GFX908-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB24_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB24_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB24_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB24_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB24_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB24_2 +; GFX908-NEXT: .LBB24_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5560,17 +5600,18 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5579,37 +5620,37 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[2:3] -; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] +; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5617,17 +5658,18 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5642,29 +5684,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5673,25 +5715,26 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5738,29 +5781,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5768,22 +5811,23 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5873,90 +5917,95 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB25_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB25_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB25_6 +; GFX908-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB25_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB25_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB25_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB25_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB25_2 +; GFX908-NEXT: .LBB25_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5964,17 +6013,18 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6037,9 +6087,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6051,12 +6100,13 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -6084,9 +6134,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6098,11 +6147,12 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6135,14 +6185,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -6162,9 +6212,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6176,12 +6225,13 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -6204,9 +6254,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6218,11 +6267,12 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6246,7 +6296,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6258,9 +6307,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6289,14 +6339,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6323,14 +6373,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6357,17 +6407,17 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6428,35 +6478,33 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6467,7 +6515,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6481,25 +6529,24 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6534,14 +6581,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -6560,35 +6607,33 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6599,19 +6644,18 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6621,11 +6665,12 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6650,10 +6695,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6662,9 +6706,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6694,14 +6739,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6729,14 +6774,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6764,17 +6809,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6837,35 +6882,33 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6876,7 +6919,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6890,25 +6933,24 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6944,14 +6986,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -6970,35 +7012,33 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7009,19 +7049,18 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7031,11 +7070,12 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7060,10 +7100,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7072,9 +7111,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7104,14 +7144,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -7139,14 +7179,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -7174,17 +7214,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7248,9 +7288,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7262,10 +7301,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -7293,9 +7333,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7307,9 +7346,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7343,13 +7383,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -7369,9 +7409,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7383,10 +7422,11 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -7409,9 +7449,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7423,9 +7462,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7450,7 +7490,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7462,8 +7501,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7492,13 +7532,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc @@ -7525,13 +7565,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -7558,16 +7598,16 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7626,37 +7666,35 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7674,37 +7712,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7722,29 +7759,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7754,38 +7791,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7797,37 +7832,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7839,31 +7874,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7874,31 +7909,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7908,31 +7943,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7942,32 +7977,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8022,37 +8057,35 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8070,37 +8103,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8119,29 +8151,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8151,38 +8183,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8194,37 +8224,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8236,31 +8266,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -8271,31 +8301,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8305,31 +8335,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8339,32 +8369,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8420,16 +8450,16 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8456,15 +8486,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -8489,14 +8519,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 @@ -8515,16 +8545,16 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8546,15 +8576,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -8578,23 +8608,23 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v5, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -8607,14 +8637,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8633,14 +8663,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v5 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8661,19 +8691,19 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8724,15 +8754,15 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8757,24 +8787,24 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8788,23 +8818,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f16_e32 v3, v4, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8815,15 +8845,15 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8843,25 +8873,25 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8875,23 +8905,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8902,22 +8932,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8927,22 +8957,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8954,22 +8984,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9017,36 +9047,34 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9057,7 +9085,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9071,25 +9099,24 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -9125,14 +9152,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 @@ -9151,35 +9178,33 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9190,19 +9215,18 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9212,11 +9236,12 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -9241,10 +9266,9 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -9253,9 +9277,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9285,14 +9310,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -9322,14 +9347,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -9357,17 +9382,17 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -9430,38 +9455,36 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -9479,38 +9502,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -9528,29 +9550,29 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9560,38 +9582,36 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9603,37 +9623,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9645,31 +9665,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -9680,33 +9700,33 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9716,31 +9736,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9750,32 +9770,32 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14239,15 +14259,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14269,14 +14289,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14294,15 +14314,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14322,14 +14342,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14349,13 +14369,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14373,13 +14393,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14397,21 +14417,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -14475,15 +14495,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14505,14 +14525,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14530,15 +14550,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14559,21 +14579,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -14586,13 +14606,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14610,13 +14630,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14636,21 +14656,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -14714,15 +14734,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14752,19 +14772,19 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX942-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 @@ -14778,22 +14798,22 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX11-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14807,21 +14827,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -14839,17 +14859,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 @@ -14867,17 +14887,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX908-NEXT: v_pk_max_f16 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -14892,21 +14912,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -14969,21 +14989,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14997,22 +15017,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB49_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15022,22 +15042,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15049,21 +15069,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -15074,20 +15094,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15097,20 +15117,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15120,24 +15140,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15196,21 +15216,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15224,22 +15244,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15249,22 +15269,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15278,21 +15298,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -15303,20 +15323,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15326,20 +15346,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15351,24 +15371,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15430,21 +15450,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15462,24 +15482,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15492,22 +15512,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15521,21 +15541,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15553,12 +15573,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -15581,12 +15601,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX908-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15604,24 +15624,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15684,15 +15704,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -15715,14 +15735,14 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15740,15 +15760,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15769,21 +15789,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -15796,13 +15816,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15822,13 +15842,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15848,21 +15868,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15925,22 +15945,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15954,22 +15974,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15979,22 +15999,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16008,21 +16028,21 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -16033,22 +16053,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16058,20 +16078,20 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16083,24 +16103,24 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16167,41 +16187,40 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16221,39 +16240,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16270,35 +16288,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 @@ -16311,42 +16329,41 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16361,41 +16378,39 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16410,34 +16425,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -16451,33 +16466,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -16491,33 +16506,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -16531,34 +16546,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -16618,41 +16633,40 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16672,39 +16686,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16721,35 +16734,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 @@ -16762,42 +16775,41 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16812,41 +16824,39 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16862,35 +16872,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -16903,33 +16913,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -16943,33 +16953,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -16985,34 +16995,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -17072,41 +17082,40 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17126,39 +17135,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17183,35 +17191,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX942-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX942-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 @@ -17225,8 +17233,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -17234,33 +17240,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_max_f32 v0, v5, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17276,8 +17283,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -17285,33 +17290,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17326,35 +17331,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -17372,33 +17377,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX90A-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -17416,33 +17421,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -17457,34 +17462,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -17543,41 +17548,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17595,40 +17600,40 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17642,38 +17647,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB57_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17683,42 +17688,43 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17731,41 +17737,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17778,35 +17784,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -17817,36 +17823,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17856,36 +17862,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17895,37 +17901,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17980,41 +17986,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18030,42 +18036,42 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18079,38 +18085,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB58_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18120,42 +18126,43 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18168,41 +18175,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18217,35 +18224,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -18256,36 +18263,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18295,36 +18302,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18336,37 +18343,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18424,41 +18431,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18476,40 +18483,40 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18527,40 +18534,40 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB59_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18573,42 +18580,43 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18624,41 +18632,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18673,35 +18681,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -18719,28 +18727,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -18763,28 +18771,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -18802,37 +18810,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18891,42 +18899,41 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18946,40 +18953,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18996,35 +19002,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 @@ -19037,42 +19043,41 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19087,41 +19092,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19137,35 +19140,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -19178,35 +19181,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -19220,33 +19223,33 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -19262,34 +19265,34 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -19348,42 +19351,42 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -19401,41 +19404,41 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -19449,38 +19452,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB61_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19490,42 +19493,43 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19538,41 +19542,41 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19587,35 +19591,35 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -19626,38 +19630,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19667,36 +19671,36 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19708,37 +19712,37 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 6eafbb50e4bb9..a55e627966cb7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -34,13 +34,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -79,13 +79,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -103,13 +103,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -177,13 +177,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -224,13 +224,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -248,13 +248,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -334,18 +334,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 @@ -388,17 +388,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 @@ -416,17 +416,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -491,21 +491,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -537,20 +537,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -560,20 +560,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -632,21 +632,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -680,20 +680,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -703,20 +703,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -784,23 +784,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -844,12 +844,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -872,12 +872,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -950,13 +950,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -997,13 +997,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1023,13 +1023,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1100,21 +1100,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1148,22 +1148,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1173,20 +1173,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1252,13 +1252,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1276,15 +1276,14 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1304,14 +1303,14 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1331,13 +1330,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1355,13 +1354,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1445,13 +1444,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1490,13 +1489,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1514,13 +1513,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1592,13 +1591,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1637,13 +1636,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1661,13 +1660,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1735,13 +1734,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1782,13 +1781,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1806,13 +1805,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1892,18 +1891,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX942-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 @@ -1946,17 +1945,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 @@ -1974,17 +1973,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -2049,21 +2048,21 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2095,20 +2094,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2118,20 +2117,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2190,21 +2189,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2238,20 +2237,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2261,20 +2260,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2342,23 +2341,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2402,12 +2401,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2430,12 +2429,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2508,13 +2507,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2555,13 +2554,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2581,13 +2580,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2658,21 +2657,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2706,22 +2705,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2731,20 +2730,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2802,29 +2801,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2833,25 +2832,26 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2898,29 +2898,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2928,22 +2928,23 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3033,90 +3034,95 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB18_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_6 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB18_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB18_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB18_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB18_2 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3124,17 +3130,18 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3192,7 +3199,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo @@ -3219,9 +3225,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3240,6 +3247,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3295,7 +3303,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo @@ -3318,9 +3325,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3339,6 +3347,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3436,7 +3445,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3459,8 +3467,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3476,6 +3485,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3491,7 +3501,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3519,8 +3528,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3536,6 +3546,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3604,7 +3615,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo @@ -3631,9 +3641,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3652,6 +3663,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3708,7 +3720,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo @@ -3731,9 +3742,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3752,6 +3764,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3849,7 +3862,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3872,8 +3884,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3889,6 +3902,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3904,7 +3918,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3932,8 +3945,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3949,6 +3963,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -4017,7 +4032,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4034,20 +4048,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4056,19 +4071,20 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4113,7 +4129,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4126,21 +4141,22 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4148,18 +4164,19 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4245,7 +4262,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4258,38 +4274,40 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4297,7 +4315,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4312,42 +4329,44 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[2:3] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4403,14 +4422,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4423,20 +4441,21 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4444,20 +4463,21 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4504,13 +4524,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4520,40 +4539,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4643,11 +4664,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4658,38 +4678,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4697,13 +4719,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4714,42 +4735,44 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4808,14 +4831,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4828,20 +4850,21 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4849,20 +4872,21 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4910,13 +4934,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4926,40 +4949,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5049,11 +5074,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5064,38 +5088,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5103,13 +5129,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5120,42 +5145,44 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5214,29 +5241,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5245,25 +5272,26 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5310,29 +5338,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5340,76 +5368,82 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execz .LBB24_4 -; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_cbranch_execnz .LBB24_3 +; GFX10-NEXT: ; %bb.1: ; %Flow2 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB24_6 +; GFX10-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB24_3: ; %atomicrmw.global +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX10-NEXT: .LBB24_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX10-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB24_2 -; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_cbranch_execnz .LBB24_4 +; GFX10-NEXT: ; %bb.5: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX10-NEXT: .LBB24_4: ; %Flow2 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 -; GFX10-NEXT: s_cbranch_execz .LBB24_6 -; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB24_2 +; GFX10-NEXT: .LBB24_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5419,7 +5453,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5437,8 +5470,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -5457,6 +5491,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] @@ -5469,90 +5504,95 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB24_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB24_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB24_6 +; GFX908-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB24_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB24_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB24_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB24_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB24_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB24_2 +; GFX908-NEXT: .LBB24_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5560,17 +5600,18 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5579,37 +5620,37 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[2:3] -; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] +; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5617,17 +5658,18 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5642,29 +5684,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5673,25 +5715,26 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5738,29 +5781,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5768,22 +5811,23 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5873,90 +5917,95 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB25_4 -; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_cbranch_execnz .LBB25_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB25_6 +; GFX908-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB25_2 -; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_cbranch_execnz .LBB25_4 +; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX908-NEXT: .LBB25_4: ; %Flow2 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB25_6 -; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_cbranch_execz .LBB25_2 +; GFX908-NEXT: .LBB25_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5964,17 +6013,18 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -6037,9 +6087,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6051,12 +6100,13 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -6084,9 +6134,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6098,11 +6147,12 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6135,14 +6185,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -6162,9 +6212,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6176,12 +6225,13 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -6204,9 +6254,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] @@ -6218,11 +6267,12 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6246,7 +6296,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6258,9 +6307,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6289,14 +6339,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6323,14 +6373,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6357,17 +6407,17 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6428,35 +6478,33 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6467,7 +6515,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6481,25 +6529,24 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6534,14 +6581,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -6560,35 +6607,33 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6599,19 +6644,18 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6621,11 +6665,12 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6650,10 +6695,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6662,9 +6706,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6694,14 +6739,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6729,14 +6774,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6764,17 +6809,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6837,35 +6882,33 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6876,7 +6919,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6890,25 +6933,24 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6944,14 +6986,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -6970,35 +7012,33 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7009,19 +7049,18 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7031,11 +7070,12 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7060,10 +7100,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7072,9 +7111,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7104,14 +7144,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -7139,14 +7179,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -7174,17 +7214,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7248,9 +7288,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7262,10 +7301,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -7293,9 +7333,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7307,9 +7346,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7343,13 +7383,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -7369,9 +7409,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7383,10 +7422,11 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -7409,9 +7449,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] @@ -7423,9 +7462,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7450,7 +7490,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7462,8 +7501,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7492,13 +7532,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc @@ -7525,13 +7565,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -7558,16 +7598,16 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7626,37 +7666,35 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7674,37 +7712,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7722,29 +7759,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7754,38 +7791,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 -; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7797,37 +7832,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7839,31 +7874,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7874,31 +7909,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7908,31 +7943,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7942,32 +7977,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8022,37 +8057,35 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8070,37 +8103,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8119,29 +8151,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8151,38 +8183,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8194,37 +8224,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8236,31 +8266,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -8271,31 +8301,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8305,31 +8335,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8339,32 +8369,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8420,16 +8450,16 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8456,15 +8486,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -8489,14 +8519,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX942-NEXT: v_min_f16_e32 v3, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 @@ -8515,16 +8545,16 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8546,15 +8576,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -8578,23 +8608,23 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v0, v5, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -8607,14 +8637,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8633,14 +8663,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v5 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8661,19 +8691,19 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8724,15 +8754,15 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8757,24 +8787,24 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8788,23 +8818,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f16_e32 v3, v4, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8815,15 +8845,15 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8843,25 +8873,25 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8875,23 +8905,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8902,22 +8932,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8927,22 +8957,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8954,22 +8984,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9017,36 +9047,34 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9057,7 +9085,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9071,25 +9099,24 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -9125,14 +9152,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 @@ -9151,35 +9178,33 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9190,19 +9215,18 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9212,11 +9236,12 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -9241,10 +9266,9 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -9253,9 +9277,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9285,14 +9310,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -9322,14 +9347,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -9357,17 +9382,17 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -9430,38 +9455,36 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -9479,38 +9502,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -9528,29 +9550,29 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: flat_load_dword v3, v[0:1] -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: flat_load_dword v5, v[0:1] +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9560,38 +9582,36 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9603,37 +9623,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9645,31 +9665,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -9680,33 +9700,33 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9716,31 +9736,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9750,32 +9770,32 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14239,15 +14259,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14269,14 +14289,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14294,15 +14314,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14322,14 +14342,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14349,13 +14369,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14373,13 +14393,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14397,21 +14417,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -14475,15 +14495,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14505,14 +14525,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14530,15 +14550,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14559,21 +14579,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -14586,13 +14606,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14610,13 +14630,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14636,21 +14656,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -14714,15 +14734,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14752,19 +14772,19 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX942-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB48_1 @@ -14778,22 +14798,22 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX11-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX11-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14807,21 +14827,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -14839,17 +14859,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 @@ -14867,17 +14887,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX908-NEXT: v_pk_min_f16 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_min_f16 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -14892,21 +14912,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -14969,21 +14989,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14997,22 +15017,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB49_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15022,22 +15042,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15049,21 +15069,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -15074,20 +15094,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15097,20 +15117,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15120,24 +15140,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15196,21 +15216,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15224,22 +15244,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15249,22 +15269,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15278,21 +15298,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -15303,20 +15323,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15326,20 +15346,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15351,24 +15371,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15430,21 +15450,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15462,24 +15482,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15492,22 +15512,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15521,21 +15541,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15553,12 +15573,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -15581,12 +15601,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX908-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX908-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15604,24 +15624,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15684,15 +15704,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -15715,14 +15735,14 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15740,15 +15760,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15769,21 +15789,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -15796,13 +15816,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15822,13 +15842,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15848,21 +15868,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15925,22 +15945,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15954,22 +15974,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15979,22 +15999,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16008,21 +16028,21 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -16033,22 +16053,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16058,20 +16078,20 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16083,24 +16103,24 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16167,41 +16187,40 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16221,39 +16240,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16270,35 +16288,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 @@ -16311,42 +16329,41 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16361,41 +16378,39 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16410,34 +16425,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -16451,33 +16466,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -16491,33 +16506,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -16531,34 +16546,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -16618,41 +16633,40 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16672,39 +16686,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16721,35 +16734,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 @@ -16762,42 +16775,41 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16812,41 +16824,39 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16862,35 +16872,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -16903,33 +16913,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -16943,33 +16953,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -16985,34 +16995,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -17072,41 +17082,40 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17126,39 +17135,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17183,35 +17191,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX942-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX942-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 @@ -17225,8 +17233,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -17234,33 +17240,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_min_f32 v0, v5, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17276,8 +17283,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -17285,33 +17290,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17326,35 +17331,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -17372,33 +17377,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX90A-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -17416,33 +17421,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -17457,34 +17462,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -17543,41 +17548,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17595,40 +17600,40 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17642,38 +17647,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB57_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17683,42 +17688,43 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17731,41 +17737,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17778,35 +17784,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -17817,36 +17823,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17856,36 +17862,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17895,37 +17901,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17980,41 +17986,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18030,42 +18036,42 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18079,38 +18085,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB58_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18120,42 +18126,43 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18168,41 +18175,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18217,35 +18224,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -18256,36 +18263,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18295,36 +18302,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18336,37 +18343,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18424,41 +18431,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18476,40 +18483,40 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18527,40 +18534,40 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB59_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18573,42 +18580,43 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18624,41 +18632,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18673,35 +18681,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -18719,28 +18727,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -18763,28 +18771,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -18802,37 +18810,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18891,42 +18899,41 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18946,40 +18953,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18996,35 +19002,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 @@ -19037,42 +19043,41 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19087,41 +19092,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19137,35 +19140,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -19178,35 +19181,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -19220,33 +19223,33 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -19262,34 +19265,34 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -19348,42 +19351,42 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -19401,41 +19404,41 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -19449,38 +19452,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB61_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19490,42 +19493,43 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19538,41 +19542,41 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19587,35 +19591,35 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -19626,38 +19630,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19667,36 +19671,36 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19708,37 +19712,37 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 25f29c8c87c96..a81e8562be3fb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -15586,41 +15586,40 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -15640,39 +15639,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -15689,35 +15687,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 @@ -15730,42 +15728,41 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15780,41 +15777,39 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15829,34 +15824,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -15870,33 +15865,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 @@ -15910,33 +15905,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 @@ -15950,34 +15945,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 @@ -16037,41 +16032,40 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16091,39 +16085,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16140,35 +16133,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 @@ -16181,42 +16174,41 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16231,41 +16223,39 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16281,35 +16271,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -16322,33 +16312,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 @@ -16362,33 +16352,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 @@ -16404,34 +16394,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 @@ -16491,41 +16481,40 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16545,39 +16534,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16602,35 +16590,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX942-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX942-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v0, v3, v0, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 @@ -16644,8 +16632,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -16653,33 +16639,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_sub_f32 v0, v5, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16695,8 +16682,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4] ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -16704,33 +16689,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16745,35 +16730,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -16791,33 +16776,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX90A-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 @@ -16835,33 +16820,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB52_1 @@ -16876,34 +16861,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -16962,41 +16947,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17014,40 +16999,40 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17061,38 +17046,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17102,42 +17087,43 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17150,41 +17136,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17197,35 +17183,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -17236,36 +17222,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17275,36 +17261,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17314,37 +17300,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17399,41 +17385,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17451,40 +17437,40 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17498,38 +17484,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17539,42 +17525,43 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17587,41 +17574,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17636,35 +17623,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -17675,36 +17662,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17714,36 +17701,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17755,37 +17742,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17843,41 +17830,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17895,40 +17882,40 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17946,40 +17933,40 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX942-NEXT: flat_load_dword v3, v[4:5] +; GFX942-NEXT: flat_load_dword v5, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17992,42 +17979,43 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18043,41 +18031,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18092,35 +18080,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -18138,28 +18126,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -18182,28 +18170,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX908-NEXT: v_sub_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_sub_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -18221,37 +18209,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18310,42 +18298,41 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18365,40 +18352,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18415,35 +18401,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 @@ -18456,42 +18442,41 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18506,41 +18491,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18556,35 +18539,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -18597,35 +18580,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -18639,33 +18622,33 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -18681,34 +18664,34 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -18767,42 +18750,42 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18820,41 +18803,41 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18868,38 +18851,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX942-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB57_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18909,42 +18892,43 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18957,41 +18941,41 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19006,35 +18990,35 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -19045,38 +19029,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19086,36 +19070,36 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX908-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19127,37 +19111,37 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index e74ad3d62bea4..f0c7a382c7909 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -1345,18 +1345,20 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB34_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1368,18 +1370,20 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB34_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1391,18 +1395,20 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB34_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1420,22 +1426,24 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB35_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar: @@ -1445,22 +1453,24 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB35_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar: @@ -1468,18 +1478,20 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB35_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1497,18 +1509,18 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB36_1 @@ -1522,18 +1534,18 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB36_1 @@ -1547,18 +1559,18 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_subrev_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB36_1 @@ -1575,24 +1587,26 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB37_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar: @@ -1600,24 +1614,26 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB37_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar: @@ -1626,18 +1642,18 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_subrev_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB37_1 @@ -2007,18 +2023,20 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB44_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2030,18 +2048,20 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB44_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2053,18 +2073,20 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB44_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2082,22 +2104,24 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB45_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar: @@ -2107,22 +2131,24 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB45_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar: @@ -2130,18 +2156,20 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB45_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2159,18 +2187,18 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_and_b32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB46_1 @@ -2184,18 +2212,18 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_and_b32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB46_1 @@ -2209,18 +2237,18 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_and_b32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB46_1 @@ -2237,24 +2265,26 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_and_b32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB47_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar: @@ -2262,24 +2292,26 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_and_b32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB47_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar: @@ -2288,18 +2320,18 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_and_b32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB47_1 @@ -2681,19 +2713,21 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN1-NEXT: v_not_b32_e32 v2, v2 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2705,19 +2739,21 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN2-NEXT: v_not_b32_e32 v2, v2 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2729,19 +2765,21 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN3-NEXT: v_not_b32_e32 v2, v2 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_not_b32_e32 v0, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2759,23 +2797,25 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN1-NEXT: v_not_b32_e32 v2, v2 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2785,23 +2825,25 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN2-NEXT: v_not_b32_e32 v2, v2 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2809,19 +2851,21 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN3-NEXT: v_not_b32_e32 v2, v2 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_not_b32_e32 v0, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2839,19 +2883,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN1-NEXT: v_not_b32_e32 v3, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB56_1 @@ -2865,19 +2909,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN2-NEXT: v_not_b32_e32 v3, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB56_1 @@ -2891,19 +2935,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN3-NEXT: v_not_b32_e32 v3, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_not_b32_e32 v0, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB56_1 @@ -2920,25 +2964,27 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN1-NEXT: v_not_b32_e32 v3, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2946,25 +2992,27 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN2-NEXT: v_not_b32_e32 v3, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2973,19 +3021,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN3-NEXT: v_not_b32_e32 v3, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_not_b32_e32 v0, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB57_1 @@ -3440,18 +3488,20 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB64_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3463,18 +3513,20 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB64_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3486,18 +3538,20 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_or_b32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB64_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3515,22 +3569,24 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB65_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar: @@ -3540,22 +3596,24 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB65_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar: @@ -3563,18 +3621,20 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_or_b32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB65_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3592,18 +3652,18 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_or_b32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB66_1 @@ -3617,18 +3677,18 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_or_b32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB66_1 @@ -3642,18 +3702,18 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_or_b32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB66_1 @@ -3670,24 +3730,26 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_or_b32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB67_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -3695,24 +3757,26 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_or_b32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB67_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -3721,18 +3785,18 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_or_b32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB67_1 @@ -4102,18 +4166,20 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB74_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4125,18 +4191,20 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB74_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4148,18 +4216,20 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB74_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4177,22 +4247,24 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB75_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -4202,22 +4274,24 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB75_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -4225,18 +4299,20 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB75_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4254,18 +4330,18 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_xor_b32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB76_1 @@ -4279,18 +4355,18 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_xor_b32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB76_1 @@ -4304,18 +4380,18 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_xor_b32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB76_1 @@ -4332,24 +4408,26 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_xor_b32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB77_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -4357,24 +4435,26 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_xor_b32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB77_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -4383,18 +4463,18 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_xor_b32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB77_1 @@ -4764,18 +4844,20 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4787,18 +4869,20 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4810,18 +4894,20 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4839,22 +4925,24 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -4864,22 +4952,24 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -4887,18 +4977,20 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4916,18 +5008,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB86_1 @@ -4941,18 +5033,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB86_1 @@ -4966,18 +5058,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB86_1 @@ -4994,24 +5086,26 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -5019,24 +5113,26 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -5045,18 +5141,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB87_1 @@ -5082,19 +5178,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -5112,19 +5210,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -5140,19 +5240,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -5178,25 +5280,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: @@ -5213,25 +5317,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: @@ -5246,25 +5352,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5286,19 +5394,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -5314,19 +5424,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -5342,19 +5454,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -5377,25 +5491,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: @@ -5410,25 +5526,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: @@ -5443,25 +5561,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5828,18 +5948,20 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5851,18 +5973,20 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5874,18 +5998,20 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5903,22 +6029,24 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -5928,22 +6056,24 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -5951,18 +6081,20 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5980,18 +6112,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB100_1 @@ -6005,18 +6137,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB100_1 @@ -6030,18 +6162,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB100_1 @@ -6058,24 +6190,26 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -6083,24 +6217,26 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -6109,18 +6245,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB101_1 @@ -6146,19 +6282,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_u32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -6176,19 +6314,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_u32_e32 v0, s2, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -6204,19 +6344,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_max_u32_e32 v0, s2, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -6242,25 +6384,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -6277,25 +6421,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -6310,25 +6456,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -6351,25 +6499,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: @@ -6384,25 +6534,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: @@ -6417,25 +6569,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -6802,18 +6956,20 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6825,18 +6981,20 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6848,18 +7006,20 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6877,22 +7037,24 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -6902,22 +7064,24 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -6925,18 +7089,20 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6954,18 +7120,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB113_1 @@ -6979,18 +7145,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB113_1 @@ -7004,18 +7170,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB113_1 @@ -7032,24 +7198,26 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -7057,24 +7225,26 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -7083,18 +7253,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB114_1 @@ -7464,18 +7634,20 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7487,18 +7659,20 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7510,18 +7684,20 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7539,22 +7715,24 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -7564,22 +7742,24 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -7587,18 +7767,20 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7616,18 +7798,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB123_1 @@ -7641,18 +7823,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB123_1 @@ -7666,18 +7848,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB123_1 @@ -7694,24 +7876,26 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -7719,24 +7903,26 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -7745,18 +7931,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB124_1 @@ -7782,19 +7968,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -7812,19 +8000,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_i32_e32 v0, s2, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -7840,19 +8030,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_min_i32_e32 v0, s2, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -7878,25 +8070,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: @@ -7913,25 +8107,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: @@ -7946,25 +8142,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -7977,72 +8175,78 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb +; GCN1-NEXT: s_mov_b64 s[2:3], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c +; GCN2-NEXT: s_mov_b64 s[2:3], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -8064,25 +8268,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: @@ -8097,25 +8303,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: @@ -8130,25 +8338,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -8539,20 +8749,22 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB135_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3 -; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 -; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB135_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8564,20 +8776,22 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB135_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 -; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB135_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8589,20 +8803,22 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB135_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_add_u32_e32 v2, 1, v3 -; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 -; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB135_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8620,24 +8836,26 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB136_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3 -; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 -; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB136_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -8647,24 +8865,26 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB136_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 -; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB136_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -8672,20 +8892,22 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB136_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_add_u32_e32 v2, 1, v3 -; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 -; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB136_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8703,20 +8925,20 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v4 -; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 -; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB137_1 @@ -8730,20 +8952,20 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v4 -; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 -; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB137_1 @@ -8757,20 +8979,20 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_add_u32_e32 v0, 1, v4 -; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 -; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB137_1 @@ -8787,26 +9009,28 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB138_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v4 -; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 -; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB138_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -8814,26 +9038,28 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB138_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v4 -; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 -; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB138_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -8842,20 +9068,20 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB138_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_add_u32_e32 v0, 1, v4 -; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 -; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB138_1 @@ -9273,23 +9499,25 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s6 ; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB145_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9301,23 +9529,25 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s6 ; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB145_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9329,23 +9559,25 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s6 ; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 -; GCN3-NEXT: v_add_u32_e32 v2, -1, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_add_u32_e32 v0, -1, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB145_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9359,61 +9591,65 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 16 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 ; GCN1-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s36 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v3, s37 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_cbranch_execnz .LBB146_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 16 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 ; GCN2-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s36 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v3, s37 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_cbranch_execnz .LBB146_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: @@ -9421,23 +9657,25 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s6 ; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 -; GCN3-NEXT: v_add_u32_e32 v2, -1, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_add_u32_e32 v0, -1, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB146_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9455,23 +9693,23 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s6 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v5, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB147_1 @@ -9485,23 +9723,23 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s6 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v5, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB147_1 @@ -9515,23 +9753,23 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v3, s6 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v5, v0 -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 -; GCN3-NEXT: v_add_u32_e32 v0, -1, v5 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v4, -1, v1 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB147_1 @@ -9546,61 +9784,65 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 16 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 ; GCN1-NEXT: .LBB148_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v5, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s36 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc +; GCN1-NEXT: v_mov_b32_e32 v3, s37 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_cbranch_execnz .LBB148_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 16 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 ; GCN2-NEXT: .LBB148_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v5, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s36 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc +; GCN2-NEXT: v_mov_b32_e32 v3, s37 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_cbranch_execnz .LBB148_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: @@ -9609,23 +9851,23 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v3, s6 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB148_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v5, v0 -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 -; GCN3-NEXT: v_add_u32_e32 v0, -1, v5 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v4, -1, v1 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB148_1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index 757649ca592b3..d5b7cb01f2f8a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -455,26 +455,28 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_and_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -482,26 +484,28 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_and_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -527,64 +531,68 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_offset: @@ -619,23 +627,25 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -650,23 +660,25 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -705,27 +717,29 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_addr64_offset: @@ -739,27 +753,29 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: @@ -794,13 +810,13 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -821,13 +837,13 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -865,19 +881,19 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s5, v7 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 @@ -897,19 +913,19 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_and_b32_e32 v5, s5, v7 -; GFX8-NEXT: v_and_b32_e32 v4, s4, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 @@ -949,23 +965,25 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -978,23 +996,25 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_and_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -1030,27 +1050,29 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret_addr64: @@ -1062,27 +1084,29 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: @@ -1111,27 +1135,29 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_sub_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -1139,27 +1165,29 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_sub_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -1185,66 +1213,70 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 -; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s4, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_offset: @@ -1279,24 +1311,26 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -1311,24 +1345,26 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -1367,28 +1403,30 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 -; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s4, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset: @@ -1402,28 +1440,30 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: @@ -1458,14 +1498,14 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -1486,14 +1526,14 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -1531,20 +1571,20 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-NEXT: v_mov_b32_e32 v7, v0 -; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 -; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s4, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 @@ -1564,20 +1604,20 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 @@ -1617,24 +1657,26 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -1647,24 +1689,26 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -1700,28 +1744,30 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 -; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s4, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret_addr64: @@ -1733,28 +1779,30 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: @@ -1783,28 +1831,30 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_max_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -1812,28 +1862,30 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_max_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -1859,68 +1911,72 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_offset: @@ -1955,25 +2011,27 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -1988,25 +2046,27 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -2045,29 +2105,31 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: @@ -2081,29 +2143,31 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: @@ -2138,16 +2202,16 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -2167,16 +2231,16 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -2213,21 +2277,21 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 @@ -2247,21 +2311,21 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB29_1 @@ -2301,25 +2365,27 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -2332,25 +2398,27 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -2386,29 +2454,31 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: @@ -2420,29 +2490,31 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: @@ -2471,28 +2543,30 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_umax_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -2500,28 +2574,30 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_umax_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -2547,68 +2623,72 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_offset: @@ -2643,25 +2723,27 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -2676,25 +2758,27 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -2733,29 +2817,31 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -2769,29 +2855,31 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -2826,16 +2914,16 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -2855,16 +2943,16 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -2901,21 +2989,21 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 @@ -2935,21 +3023,21 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB37_1 @@ -2989,25 +3077,27 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -3020,25 +3110,27 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -3074,29 +3166,31 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: @@ -3108,29 +3202,31 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: @@ -3159,28 +3255,30 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_min_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -3188,28 +3286,30 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_min_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -3235,68 +3335,72 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_offset: @@ -3331,25 +3435,27 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -3364,25 +3470,27 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -3421,29 +3529,31 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: @@ -3457,29 +3567,31 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: @@ -3514,16 +3626,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3543,16 +3655,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3589,21 +3701,21 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 @@ -3623,21 +3735,21 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB45_1 @@ -3677,25 +3789,27 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -3708,25 +3822,27 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -3762,29 +3878,31 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: @@ -3796,29 +3914,31 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: @@ -3847,28 +3967,30 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_umin_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -3876,28 +3998,30 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_umin_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -3923,68 +4047,72 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_offset: @@ -4019,25 +4147,27 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4052,25 +4182,27 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4109,29 +4241,31 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset: @@ -4145,29 +4279,31 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: @@ -4202,16 +4338,16 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4231,16 +4367,16 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4277,21 +4413,21 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB53_1 @@ -4311,21 +4447,21 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 @@ -4365,25 +4501,27 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4396,25 +4534,27 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4450,29 +4590,31 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret_addr64: @@ -4484,29 +4626,31 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: @@ -4535,26 +4679,28 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_or_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4562,26 +4708,28 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_or_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4607,64 +4755,68 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_or_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_or_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_or_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_or_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_or_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_offset: @@ -4699,23 +4851,25 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4730,23 +4884,25 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4785,27 +4941,29 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_or_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_or_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_or_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_addr64_offset: @@ -4819,27 +4977,29 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_or_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_or_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: @@ -4874,13 +5034,13 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -4901,13 +5061,13 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -4945,19 +5105,19 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_or_b32_e32 v5, s5, v7 -; GFX7-NEXT: v_or_b32_e32 v4, s4, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_or_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB61_1 @@ -4977,19 +5137,19 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_or_b32_e32 v5, s5, v7 -; GFX8-NEXT: v_or_b32_e32 v4, s4, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_or_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 @@ -5029,23 +5189,25 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB62_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -5058,23 +5220,25 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB62_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -5110,27 +5274,29 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_or_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_or_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_or_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB63_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret_addr64: @@ -5142,27 +5308,29 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_or_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_or_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB63_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: @@ -5733,26 +5901,28 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_xor_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB74_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -5760,26 +5930,28 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_xor_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB74_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -5805,64 +5977,68 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_xor_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB75_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_xor_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB75_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_offset: @@ -5897,23 +6073,25 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB76_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -5928,23 +6106,25 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB76_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -5983,27 +6163,29 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_xor_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB77_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset: @@ -6017,27 +6199,29 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_xor_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB77_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: @@ -6072,13 +6256,13 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6099,13 +6283,13 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -6143,19 +6327,19 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_xor_b32_e32 v5, s5, v7 -; GFX7-NEXT: v_xor_b32_e32 v4, s4, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_xor_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB79_1 @@ -6175,19 +6359,19 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_xor_b32_e32 v5, s5, v7 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_xor_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB79_1 @@ -6227,23 +6411,25 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3 ; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB80_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -6256,23 +6442,25 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3 ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB80_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -6308,27 +6496,29 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5 -; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_xor_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s4, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB81_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret_addr64: @@ -6340,27 +6530,29 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_xor_b32_e32 v1, s5, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB81_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: @@ -7757,19 +7949,21 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_inc_i64_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7777,9 +7971,9 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB107_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -7787,19 +7981,21 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_inc_i64_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7807,9 +8003,9 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB107_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -7835,70 +8031,74 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB108_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_inc_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB108_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_offset: @@ -7933,16 +8133,18 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7950,9 +8152,9 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB109_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -7967,16 +8169,18 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7984,9 +8188,9 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB109_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -8025,30 +8229,32 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB110_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset: @@ -8062,30 +8268,32 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB110_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: @@ -8120,14 +8328,14 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8150,14 +8358,14 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8196,29 +8404,31 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB112_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_inc_i64_ret: @@ -8229,29 +8439,31 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB112_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret: @@ -8283,16 +8495,18 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8300,9 +8514,9 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB113_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -8315,16 +8529,18 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8332,9 +8548,9 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB113_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -8370,30 +8586,32 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB114_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_inc_i64_ret_incr64: @@ -8405,30 +8623,32 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB114_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB114_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64: @@ -8456,24 +8676,26 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_dec_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s4, 32 -; GFX7-NEXT: s_addc_u32 s1, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_add_u32 s4, s0, 32 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2 -; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8481,33 +8703,35 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB115_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_u32 s0, s4, 32 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2 -; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8515,9 +8739,9 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB115_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -8543,78 +8767,82 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s8, 32 -; GFX7-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: s_add_u32 s6, s8, 32 +; GFX7-NEXT: s_addc_u32 s7, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] -; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8 -; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v7, s[2:3], -1, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] ; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB116_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: s_add_u32 s6, s8, 32 +; GFX8-NEXT: s_addc_u32 s7, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] -; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8 -; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] ; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB116_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_offset: @@ -8641,28 +8869,30 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_dec_i64_decr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s4, s0, 32 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2 -; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8670,37 +8900,39 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB117_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_decr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 -; GFX8-NEXT: s_add_u32 s0, s0, 32 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2 -; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8708,9 +8940,9 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB117_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -8745,38 +8977,40 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 ; GFX7-NEXT: s_add_u32 s0, s4, s0 ; GFX7-NEXT: s_addc_u32 s1, s5, s1 -; GFX7-NEXT: s_add_u32 s0, s0, 32 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s9 -; GFX7-NEXT: v_mov_b32_e32 v5, s8 +; GFX7-NEXT: s_add_u32 s4, s0, 32 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9] -; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8 -; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v7, s[2:3], -1, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] ; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_cbranch_execnz .LBB118_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset: @@ -8786,38 +9020,40 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 ; GFX8-NEXT: s_add_u32 s0, s4, s0 ; GFX8-NEXT: s_addc_u32 s1, s5, s1 -; GFX8-NEXT: s_add_u32 s0, s0, 32 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9] -; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8 -; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] ; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_cbranch_execnz .LBB118_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: @@ -8847,23 +9083,23 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_dec_i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2 -; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8871,9 +9107,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB119_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -8881,23 +9117,23 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX8-LABEL: atomic_dec_i64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2 -; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8905,9 +9141,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB119_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -8936,33 +9172,35 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] -; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8 -; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v7, s[2:3], -1, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] ; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB120_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret: @@ -8973,33 +9211,35 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9] -; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8 -; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] ; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB120_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret: @@ -9025,26 +9265,28 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_dec_i64_decr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s4, s0, s4 +; GFX7-NEXT: s_addc_u32 s5, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2 -; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -9052,35 +9294,37 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB121_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_decr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s4, s0, s4 +; GFX8-NEXT: s_addc_u32 s5, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2 -; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -9088,9 +9332,9 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB121_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -9122,38 +9366,40 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s9 -; GFX7-NEXT: v_mov_b32_e32 v5, s8 +; GFX7-NEXT: s_add_u32 s4, s4, s0 +; GFX7-NEXT: s_addc_u32 s5, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9] -; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8 -; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v7, s[2:3], -1, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] ; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_cbranch_execnz .LBB122_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret_decr64: @@ -9161,38 +9407,40 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: s_add_u32 s4, s4, s0 +; GFX8-NEXT: s_addc_u32 s5, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9] -; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8 -; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3] ; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_cbranch_execnz .LBB122_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64: diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 524100c5b7a25..1cd67a80a22f7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -4367,12 +4367,14 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: .LBB34_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 -; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4430,12 +4432,14 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: .LBB34_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 -; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4481,16 +4485,18 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB34_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: .LBB34_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 -; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4551,12 +4557,14 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: .LBB35_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 -; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4616,12 +4624,14 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: .LBB35_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 -; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4669,16 +4679,18 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB35_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB35_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: .LBB35_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 -; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4732,18 +4744,20 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v8, v1 -; GCN1-NEXT: v_mov_b32_e32 v7, v0 -; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 -; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB36_2 @@ -4793,18 +4807,20 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v8, v1 -; GCN2-NEXT: v_mov_b32_e32 v7, v0 -; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 -; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB36_2 @@ -4842,22 +4858,24 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB36_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v8, v1 -; GCN3-NEXT: v_mov_b32_e32 v7, v0 -; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 -; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB36_2 @@ -4910,18 +4928,20 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: .LBB37_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v8, v1 -; GCN1-NEXT: v_mov_b32_e32 v7, v0 -; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 -; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB37_2 @@ -4973,18 +4993,20 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: .LBB37_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v8, v1 -; GCN2-NEXT: v_mov_b32_e32 v7, v0 -; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 -; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB37_2 @@ -5024,22 +5046,24 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB37_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: .LBB37_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v8, v1 -; GCN3-NEXT: v_mov_b32_e32 v7, v0 -; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 -; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB37_2 @@ -6080,9 +6104,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB44_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6141,9 +6167,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB44_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6188,15 +6216,17 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB44_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6258,9 +6288,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB45_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6321,9 +6353,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB45_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6370,15 +6404,17 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB45_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB45_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB45_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6434,14 +6470,16 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: .LBB46_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_and_b32_e32 v5, s7, v7 -; GCN1-NEXT: v_and_b32_e32 v4, s6, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB46_2 @@ -6493,14 +6531,16 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: .LBB46_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_and_b32_e32 v5, s7, v7 -; GCN2-NEXT: v_and_b32_e32 v4, s6, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB46_2 @@ -6537,21 +6577,23 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB46_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB46_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_and_b32_e32 v5, s7, v7 -; GCN3-NEXT: v_and_b32_e32 v4, s6, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB46_2 @@ -6606,14 +6648,16 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: .LBB47_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_and_b32_e32 v5, s7, v7 -; GCN1-NEXT: v_and_b32_e32 v4, s6, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB47_2 @@ -6667,14 +6711,16 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: .LBB47_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_and_b32_e32 v5, s7, v7 -; GCN2-NEXT: v_and_b32_e32 v4, s6, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB47_2 @@ -6713,21 +6759,23 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB47_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB47_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_and_b32_e32 v5, s7, v7 -; GCN3-NEXT: v_and_b32_e32 v4, s6, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB47_2 @@ -7818,6 +7866,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: v_not_b32_e32 v1, v0 ; GCN1-NEXT: v_not_b32_e32 v0, v6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7883,6 +7933,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: v_not_b32_e32 v1, v0 ; GCN2-NEXT: v_not_b32_e32 v0, v6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7931,15 +7983,17 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: v_not_b32_e32 v1, v0 ; GCN3-NEXT: v_not_b32_e32 v0, v6 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8008,6 +8062,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: v_not_b32_e32 v1, v0 ; GCN1-NEXT: v_not_b32_e32 v0, v6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8075,6 +8131,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: v_not_b32_e32 v1, v0 ; GCN2-NEXT: v_not_b32_e32 v0, v6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8125,15 +8183,17 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB55_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB55_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: v_not_b32_e32 v1, v0 ; GCN3-NEXT: v_not_b32_e32 v0, v6 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8193,16 +8253,18 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_not_b32_e32 v1, v0 +; GCN1-NEXT: v_not_b32_e32 v0, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB56_2 @@ -8256,16 +8318,18 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN2-NEXT: v_not_b32_e32 v5, v0 -; GCN2-NEXT: v_not_b32_e32 v4, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_not_b32_e32 v1, v0 +; GCN2-NEXT: v_not_b32_e32 v0, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB56_2 @@ -8304,23 +8368,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB56_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN3-NEXT: v_not_b32_e32 v5, v0 -; GCN3-NEXT: v_not_b32_e32 v4, v1 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_not_b32_e32 v1, v0 +; GCN3-NEXT: v_not_b32_e32 v0, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB56_2 @@ -8377,16 +8443,18 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_not_b32_e32 v1, v0 +; GCN1-NEXT: v_not_b32_e32 v0, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB57_2 @@ -8442,16 +8510,18 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN2-NEXT: v_not_b32_e32 v5, v0 -; GCN2-NEXT: v_not_b32_e32 v4, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_not_b32_e32 v1, v0 +; GCN2-NEXT: v_not_b32_e32 v0, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB57_2 @@ -8492,23 +8562,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB57_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN3-NEXT: v_not_b32_e32 v5, v0 -; GCN3-NEXT: v_not_b32_e32 v4, v1 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_not_b32_e32 v1, v0 +; GCN3-NEXT: v_not_b32_e32 v0, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB57_2 @@ -9682,9 +9754,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB64_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 ; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -9743,9 +9817,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB64_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 ; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9790,15 +9866,17 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB64_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 ; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -9860,9 +9938,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB65_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 ; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -9923,9 +10003,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB65_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 ; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9972,15 +10054,17 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB65_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 ; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -10036,14 +10120,16 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: .LBB66_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_or_b32_e32 v5, s7, v7 -; GCN1-NEXT: v_or_b32_e32 v4, s6, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB66_2 @@ -10095,14 +10181,16 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: .LBB66_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_or_b32_e32 v5, s7, v7 -; GCN2-NEXT: v_or_b32_e32 v4, s6, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB66_2 @@ -10139,21 +10227,23 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB66_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB66_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_or_b32_e32 v5, s7, v7 -; GCN3-NEXT: v_or_b32_e32 v4, s6, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB66_2 @@ -10208,14 +10298,16 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: .LBB67_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_or_b32_e32 v5, s7, v7 -; GCN1-NEXT: v_or_b32_e32 v4, s6, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB67_2 @@ -10269,14 +10361,16 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: .LBB67_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_or_b32_e32 v5, s7, v7 -; GCN2-NEXT: v_or_b32_e32 v4, s6, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB67_2 @@ -10315,21 +10409,23 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB67_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB67_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_or_b32_e32 v5, s7, v7 -; GCN3-NEXT: v_or_b32_e32 v4, s6, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB67_2 @@ -11369,9 +11465,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB74_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 ; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11430,9 +11528,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB74_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 ; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11477,15 +11577,17 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB74_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 ; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -11547,9 +11649,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB75_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 ; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11610,9 +11714,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB75_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 ; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11659,15 +11765,17 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB75_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 ; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -11723,14 +11831,16 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: .LBB76_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_xor_b32_e32 v5, s7, v7 -; GCN1-NEXT: v_xor_b32_e32 v4, s6, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB76_2 @@ -11782,14 +11892,16 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: .LBB76_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_xor_b32_e32 v5, s7, v7 -; GCN2-NEXT: v_xor_b32_e32 v4, s6, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB76_2 @@ -11826,21 +11938,23 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB76_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB76_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_xor_b32_e32 v5, s7, v7 -; GCN3-NEXT: v_xor_b32_e32 v4, s6, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB76_2 @@ -11895,14 +12009,16 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: .LBB77_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_xor_b32_e32 v5, s7, v7 -; GCN1-NEXT: v_xor_b32_e32 v4, s6, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB77_2 @@ -11956,14 +12072,16 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: .LBB77_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_xor_b32_e32 v5, s7, v7 -; GCN2-NEXT: v_xor_b32_e32 v4, s6, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB77_2 @@ -12002,21 +12120,23 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB77_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB77_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_xor_b32_e32 v5, s7, v7 -; GCN3-NEXT: v_xor_b32_e32 v4, s6, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB77_2 @@ -13054,14 +13174,16 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -13120,14 +13242,16 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -13174,18 +13298,20 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB84_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB84_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -13247,14 +13373,16 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -13315,14 +13443,16 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -13371,18 +13501,20 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB85_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB85_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -13437,20 +13569,22 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB86_2 @@ -13501,20 +13635,22 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB86_2 @@ -13553,24 +13689,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB86_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB86_2 @@ -13624,20 +13762,22 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB87_2 @@ -13690,20 +13830,22 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB87_2 @@ -13744,24 +13886,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB87_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB87_2 @@ -13822,18 +13966,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -13893,18 +14039,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -13963,18 +14111,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -14031,24 +14181,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB89_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB89_2 @@ -14102,24 +14254,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB89_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB89_2 @@ -14172,24 +14326,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB89_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB89_2 @@ -14253,18 +14409,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14322,18 +14480,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14390,18 +14550,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -14455,24 +14617,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB91_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB91_2 @@ -14524,24 +14688,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB91_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB91_2 @@ -14592,24 +14758,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB91_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB91_2 @@ -15653,14 +15821,16 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -15719,14 +15889,16 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15773,18 +15945,20 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB98_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB98_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -15846,14 +16020,16 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -15914,14 +16090,16 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15970,18 +16148,20 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB99_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB99_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -16036,20 +16216,22 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB100_2 @@ -16100,20 +16282,22 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB100_2 @@ -16152,24 +16336,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB100_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB100_2 @@ -16223,20 +16409,22 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB101_2 @@ -16289,20 +16477,22 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB101_2 @@ -16343,24 +16533,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB101_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB101_2 @@ -16421,18 +16613,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -16492,18 +16686,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -16562,18 +16758,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -16630,24 +16828,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB103_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB103_2 @@ -16701,24 +16901,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB103_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB103_2 @@ -16771,24 +16973,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB103_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB103_2 @@ -16845,24 +17049,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB104_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB104_2 @@ -16914,24 +17120,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB104_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB104_2 @@ -16982,24 +17190,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB104_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB104_2 @@ -18043,14 +18253,16 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -18109,14 +18321,16 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -18163,18 +18377,20 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB111_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB111_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -18236,14 +18452,16 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -18304,14 +18522,16 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -18360,18 +18580,20 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB112_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB112_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -18426,20 +18648,22 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB113_2 @@ -18490,20 +18714,22 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB113_2 @@ -18542,24 +18768,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB113_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB113_2 @@ -18613,20 +18841,22 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB114_2 @@ -18679,20 +18909,22 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB114_2 @@ -18733,24 +18965,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB114_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB114_2 @@ -19790,14 +20024,16 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -19856,14 +20092,16 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -19910,18 +20148,20 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB121_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB121_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -19983,14 +20223,16 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -20051,14 +20293,16 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -20107,18 +20351,20 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB122_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB122_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -20173,20 +20419,22 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB123_2 @@ -20237,20 +20485,22 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB123_2 @@ -20289,24 +20539,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB123_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB123_2 @@ -20360,20 +20612,22 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB124_2 @@ -20426,20 +20680,22 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB124_2 @@ -20480,24 +20736,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB124_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB124_2 @@ -20558,18 +20816,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -20629,18 +20889,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -20699,18 +20961,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -20767,24 +21031,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB126_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB126_2 @@ -20838,24 +21104,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB126_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB126_2 @@ -20908,24 +21176,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB126_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB126_2 @@ -20985,18 +21255,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -21050,18 +21322,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -21114,18 +21388,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -21178,24 +21454,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB128_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB128_2 @@ -21247,24 +21525,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB128_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB128_2 @@ -21315,24 +21595,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB128_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB128_2 @@ -22442,6 +22724,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -22509,6 +22793,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -22558,9 +22844,9 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: .LBB135_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB135_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB135_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -22568,6 +22854,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -22638,6 +22926,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -22707,6 +22997,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -22758,9 +23050,9 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: .LBB136_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB136_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB136_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -22768,6 +23060,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -22828,17 +23122,19 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB137_2 @@ -22893,17 +23189,19 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB137_2 @@ -22943,24 +23241,26 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB137_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB137_2 @@ -23018,17 +23318,19 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB138_2 @@ -23085,17 +23387,19 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB138_2 @@ -23137,24 +23441,26 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB138_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB138_2 @@ -24314,17 +24620,19 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] -; GCN1-NEXT: s_mov_b64 s[38:39], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB145_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24332,12 +24640,12 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB145_4 ; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_branch .LBB145_2 ; GCN1-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 @@ -24388,17 +24696,19 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[38:39], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB145_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24406,12 +24716,12 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB145_4 ; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_branch .LBB145_2 ; GCN2-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -24451,20 +24761,22 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: .LBB145_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB145_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN3-NEXT: s_mov_b64 s[38:39], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB145_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24472,12 +24784,12 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB145_4 ; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_branch .LBB145_2 ; GCN3-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -24509,10 +24821,10 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 -; GCN1-NEXT: s_add_u32 s38, s4, 32 -; GCN1-NEXT: s_addc_u32 s39, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 32 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s39, s34 +; GCN1-NEXT: s_cmp_eq_u32 s37, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 @@ -24523,25 +24835,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB146_3: ; %atomicrmw.global -; GCN1-NEXT: s_add_u32 s34, s38, 4 -; GCN1-NEXT: s_addc_u32 s35, s39, 0 +; GCN1-NEXT: s_add_u32 s34, s36, 4 +; GCN1-NEXT: s_addc_u32 s35, s37, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v4, s38 -; GCN1-NEXT: v_mov_b32_e32 v5, s39 +; GCN1-NEXT: v_mov_b32_e32 v4, s36 +; GCN1-NEXT: v_mov_b32_e32 v5, s37 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] -; GCN1-NEXT: s_mov_b64 s[40:41], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: s_mov_b64 s[38:39], 0 ; GCN1-NEXT: .LBB146_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s36 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s37 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24549,18 +24863,18 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_cbranch_execnz .LBB146_4 ; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_branch .LBB146_2 ; GCN1-NEXT: .LBB146_6: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec -; GCN1-NEXT: s_cselect_b32 s34, s38, -1 +; GCN1-NEXT: s_cselect_b32 s34, s36, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s34 @@ -24585,10 +24899,10 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 -; GCN2-NEXT: s_add_u32 s38, s4, 32 -; GCN2-NEXT: s_addc_u32 s39, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 32 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s39, s34 +; GCN2-NEXT: s_cmp_eq_u32 s37, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 @@ -24599,25 +24913,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB146_3: ; %atomicrmw.global -; GCN2-NEXT: s_add_u32 s34, s38, 4 -; GCN2-NEXT: s_addc_u32 s35, s39, 0 +; GCN2-NEXT: s_add_u32 s34, s36, 4 +; GCN2-NEXT: s_addc_u32 s35, s37, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v4, s38 -; GCN2-NEXT: v_mov_b32_e32 v5, s39 +; GCN2-NEXT: v_mov_b32_e32 v4, s36 +; GCN2-NEXT: v_mov_b32_e32 v5, s37 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[40:41], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: s_mov_b64 s[38:39], 0 ; GCN2-NEXT: .LBB146_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s36 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s37 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24625,16 +24941,16 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_cbranch_execnz .LBB146_4 ; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_branch .LBB146_2 ; GCN2-NEXT: .LBB146_6: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0 -; GCN2-NEXT: s_cselect_b32 s34, s38, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[36:37], 0 +; GCN2-NEXT: s_cselect_b32 s34, s36, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s34 @@ -24658,10 +24974,10 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s38, s4, 32 -; GCN3-NEXT: s_addc_u32 s39, s5, 0 +; GCN3-NEXT: s_add_u32 s36, s4, 32 +; GCN3-NEXT: s_addc_u32 s37, s5, 0 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base -; GCN3-NEXT: s_cmp_eq_u32 s39, s35 +; GCN3-NEXT: s_cmp_eq_u32 s37, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 @@ -24672,20 +24988,22 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB146_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s38 -; GCN3-NEXT: v_mov_b32_e32 v5, s39 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN3-NEXT: s_mov_b64 s[40:41], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s36 +; GCN3-NEXT: v_mov_b32_e32 v1, s37 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[38:39], 0 ; GCN3-NEXT: .LBB146_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s36 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s37 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24693,16 +25011,16 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_cbranch_execnz .LBB146_4 ; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_branch .LBB146_2 ; GCN3-NEXT: .LBB146_6: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0 -; GCN3-NEXT: s_cselect_b32 s34, s38, -1 +; GCN3-NEXT: s_cmp_lg_u64 s[36:37], 0 +; GCN3-NEXT: s_cselect_b32 s34, s36, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 @@ -24746,24 +25064,26 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[38:39], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB147_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v7, s[36:37], -1, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_cbranch_execnz .LBB147_2 @@ -24819,24 +25139,26 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[38:39], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB147_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_cbranch_execnz .LBB147_2 @@ -24880,28 +25202,30 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB147_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[38:39], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB147_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v7, s[36:37], -1, v2 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_cbranch_execnz .LBB147_2 @@ -24960,24 +25284,26 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[40:41], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB148_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v7, s[36:37], -1, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s38 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v5, s39 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] ; GCN1-NEXT: s_cbranch_execnz .LBB148_2 @@ -25035,24 +25361,26 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[40:41], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB148_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s38 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v5, s39 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] ; GCN2-NEXT: s_cbranch_execnz .LBB148_2 @@ -25098,28 +25426,30 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB148_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s38 -; GCN3-NEXT: v_mov_b32_e32 v3, s39 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s38 +; GCN3-NEXT: v_mov_b32_e32 v1, s39 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[40:41], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB148_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v7, s[36:37], -1, v2 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s38 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v5, s39 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41] ; GCN3-NEXT: s_cbranch_execnz .LBB148_2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index 4dea4495b36fb..e1561cd71e495 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -1439,15 +1439,15 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -1472,15 +1472,15 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -1500,15 +1500,15 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1539,24 +1539,26 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret_offset_scalar: @@ -1572,24 +1574,26 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar: @@ -1598,15 +1602,15 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1636,21 +1640,21 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-NEXT: v_mov_b32_e32 v7, v0 -; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 -; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 @@ -1669,21 +1673,21 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB36_1 @@ -1697,21 +1701,21 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB36_1 @@ -1736,24 +1740,26 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-NEXT: v_mov_b32_e32 v7, v0 -; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 -; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret_offset_scalar: @@ -1769,24 +1775,26 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar: @@ -1795,21 +1803,21 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB37_1 @@ -2237,14 +2245,14 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -2269,14 +2277,14 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -2296,14 +2304,14 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -2334,23 +2342,25 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret_offset_scalar: @@ -2366,23 +2376,25 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar: @@ -2391,14 +2403,14 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -2428,20 +2440,20 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s7, v7 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 @@ -2460,20 +2472,20 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_and_b32_e32 v5, s7, v7 -; GFX8-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -2487,20 +2499,20 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB46_1 @@ -2525,23 +2537,25 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s7, v7 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret_offset_scalar: @@ -2557,23 +2571,25 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_and_b32_e32 v5, s7, v7 -; GFX8-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar: @@ -2582,20 +2598,20 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB47_1 @@ -3047,14 +3063,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_not_b32_e32 v1, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -3081,14 +3097,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v6 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -3110,14 +3126,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v6 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -3150,12 +3166,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: v_not_b32_e32 v1, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -3163,12 +3181,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -3184,12 +3202,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v6 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -3197,12 +3217,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -3211,14 +3231,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v6 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -3250,22 +3270,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX7-NEXT: v_not_b32_e32 v5, v0 -; GFX7-NEXT: v_not_b32_e32 v4, v1 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 @@ -3284,22 +3304,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX8-NEXT: v_not_b32_e32 v5, v0 -; GFX8-NEXT: v_not_b32_e32 v4, v1 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -3313,22 +3333,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v0 -; GFX9-NEXT: v_not_b32_e32 v4, v1 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB56_1 @@ -3353,25 +3373,27 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX7-NEXT: v_not_b32_e32 v5, v0 -; GFX7-NEXT: v_not_b32_e32 v4, v1 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -3387,25 +3409,27 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX8-NEXT: v_not_b32_e32 v5, v0 -; GFX8-NEXT: v_not_b32_e32 v4, v1 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -3414,22 +3438,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v0 -; GFX9-NEXT: v_not_b32_e32 v4, v1 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB57_1 @@ -3973,14 +3997,14 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 ; GFX7-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -4005,14 +4029,14 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 ; GFX8-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -4032,14 +4056,14 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4070,23 +4094,25 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 ; GFX7-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB65_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret_offset_scalar: @@ -4102,23 +4128,25 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 ; GFX8-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB65_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar: @@ -4127,14 +4155,14 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4164,20 +4192,20 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_or_b32_e32 v5, s7, v7 -; GFX7-NEXT: v_or_b32_e32 v4, s6, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB66_1 @@ -4196,20 +4224,20 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_or_b32_e32 v5, s7, v7 -; GFX8-NEXT: v_or_b32_e32 v4, s6, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB66_1 @@ -4223,20 +4251,20 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_or_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_or_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB66_1 @@ -4261,23 +4289,25 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_or_b32_e32 v5, s7, v7 -; GFX7-NEXT: v_or_b32_e32 v4, s6, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB67_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret_offset_scalar: @@ -4293,23 +4323,25 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_or_b32_e32 v5, s7, v7 -; GFX8-NEXT: v_or_b32_e32 v4, s6, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB67_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar: @@ -4318,20 +4350,20 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_or_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_or_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB67_1 @@ -4759,14 +4791,14 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 ; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -4791,14 +4823,14 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 ; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -4818,14 +4850,14 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4856,23 +4888,25 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 ; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB75_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret_offset_scalar: @@ -4888,23 +4922,25 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 ; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB75_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar: @@ -4913,14 +4949,14 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4950,20 +4986,20 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7 -; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB76_1 @@ -4982,20 +5018,20 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB76_1 @@ -5009,20 +5045,20 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB76_1 @@ -5047,23 +5083,25 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7 -; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB77_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret_offset_scalar: @@ -5079,23 +5117,25 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB77_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar: @@ -5104,20 +5144,20 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB77_1 @@ -5557,17 +5597,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -5592,17 +5632,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -5622,17 +5662,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5663,26 +5703,28 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB85_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -5698,26 +5740,28 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -5726,17 +5770,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5766,23 +5810,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB86_1 @@ -5801,23 +5845,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB86_1 @@ -5831,23 +5875,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB86_1 @@ -5872,26 +5916,28 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB87_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -5907,26 +5953,28 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -5935,23 +5983,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 @@ -5974,26 +6022,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB88_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -6008,26 +6058,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB88_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -6040,26 +6092,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -6082,30 +6136,32 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB89_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: @@ -6119,65 +6175,69 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6196,26 +6256,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB90_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -6228,26 +6290,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB90_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -6260,26 +6324,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -6299,30 +6365,32 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB91_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: @@ -6334,65 +6402,69 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6829,17 +6901,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6864,17 +6936,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -6894,17 +6966,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6935,26 +7007,28 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB99_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -6970,26 +7044,28 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB99_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -6998,17 +7074,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7038,23 +7114,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB100_1 @@ -7073,23 +7149,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB100_1 @@ -7103,23 +7179,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB100_1 @@ -7144,26 +7220,28 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB101_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -7179,26 +7257,28 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB101_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -7207,23 +7287,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB101_1 @@ -7246,26 +7326,28 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB102_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -7280,26 +7362,28 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB102_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -7312,26 +7396,28 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -7354,30 +7440,32 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB103_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -7391,65 +7479,69 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB103_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -7469,30 +7561,32 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB104_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: @@ -7504,65 +7598,69 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB104_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -7999,17 +8097,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -8034,17 +8132,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -8064,17 +8162,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8105,26 +8203,28 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB112_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -8140,26 +8240,28 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB112_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -8168,17 +8270,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8208,23 +8310,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB113_1 @@ -8243,23 +8345,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB113_1 @@ -8273,23 +8375,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB113_1 @@ -8314,26 +8416,28 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB114_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -8349,26 +8453,28 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB114_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -8377,23 +8483,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 @@ -8833,17 +8939,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -8868,17 +8974,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -8898,17 +9004,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8939,26 +9045,28 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB122_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -8974,26 +9082,28 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB122_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -9002,17 +9112,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -9042,23 +9152,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB123_1 @@ -9077,23 +9187,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB123_1 @@ -9107,23 +9217,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB123_1 @@ -9148,26 +9258,28 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB124_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -9183,26 +9295,28 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB124_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -9211,23 +9325,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 @@ -9250,26 +9364,28 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB125_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -9284,26 +9400,28 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB125_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -9316,26 +9434,28 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -9358,30 +9478,32 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB126_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: @@ -9395,65 +9517,69 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB126_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -9472,16 +9598,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -9502,16 +9628,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -9532,16 +9658,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -9568,30 +9694,32 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB128_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: @@ -9603,65 +9731,69 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB128_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -10122,15 +10254,15 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -10157,15 +10289,15 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -10187,15 +10319,15 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -10228,13 +10360,15 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -10242,12 +10376,12 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB136_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: @@ -10263,13 +10397,15 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -10277,12 +10413,12 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB136_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: @@ -10291,15 +10427,15 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -10331,23 +10467,23 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB137_1 @@ -10366,23 +10502,23 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB137_1 @@ -10396,23 +10532,23 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB137_1 @@ -10437,26 +10573,28 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB138_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: @@ -10472,26 +10610,28 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB138_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: @@ -10500,23 +10640,23 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB138_1 @@ -11004,19 +11144,19 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: s_mov_b64 s[38:39], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 -; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -11024,12 +11164,12 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB145_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: @@ -11043,19 +11183,19 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_mov_b64 s[38:39], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 -; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -11063,12 +11203,12 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB145_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: @@ -11077,19 +11217,19 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 ; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -11097,12 +11237,12 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_cbranch_execnz .LBB145_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -11112,27 +11252,29 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s34, s4, 32 -; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_add_u32 s36, s4, 32 ; GFX7-NEXT: s_addc_u32 s37, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: s_add_u32 s34, s4, 36 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s34 +; GFX7-NEXT: v_mov_b32_e32 v1, s35 +; GFX7-NEXT: v_mov_b32_e32 v4, s36 +; GFX7-NEXT: v_mov_b32_e32 v5, s37 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] ; GFX7-NEXT: s_mov_b64 s[38:39], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 ; GFX7-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 -; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s36 ; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v5, s37 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -11151,27 +11293,29 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_add_u32 s34, s4, 32 -; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_add_u32 s36, s4, 32 ; GFX8-NEXT: s_addc_u32 s37, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s36 -; GFX8-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 -; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: s_add_u32 s34, s4, 36 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] ; GFX8-NEXT: s_mov_b64 s[38:39], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 ; GFX8-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 -; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -11193,19 +11337,19 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 ; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -11213,12 +11357,12 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_cbranch_execnz .LBB146_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -11237,27 +11381,27 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[38:39], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 -; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v7, s[36:37], -1, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_cbranch_execnz .LBB147_1 @@ -11276,27 +11420,27 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[38:39], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 -; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX8-NEXT: s_cbranch_execnz .LBB147_1 @@ -11310,27 +11454,27 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v7, s[36:37], -1, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB147_1 @@ -11345,79 +11489,83 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_add_u32 s38, s4, 32 +; GFX7-NEXT: s_addc_u32 s39, s5, 0 +; GFX7-NEXT: s_add_u32 s34, s4, 36 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: s_add_u32 s36, s4, 36 -; GFX7-NEXT: s_addc_u32 s37, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: v_mov_b32_e32 v0, s34 +; GFX7-NEXT: v_mov_b32_e32 v1, s35 +; GFX7-NEXT: v_mov_b32_e32 v2, s38 +; GFX7-NEXT: v_mov_b32_e32 v3, s39 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[38:39], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[40:41], 0 ; GFX7-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 -; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v7, s[36:37], -1, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s38 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v5, s39 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[40:41] ; GFX7-NEXT: s_cbranch_execnz .LBB148_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_or_b64 exec, exec, s[40:41] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_add_u32 s38, s4, 32 +; GFX8-NEXT: s_addc_u32 s39, s5, 0 +; GFX8-NEXT: s_add_u32 s34, s4, 36 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: s_add_u32 s36, s4, 36 -; GFX8-NEXT: s_addc_u32 s37, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s36 -; GFX8-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NEXT: v_mov_b32_e32 v3, s39 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[38:39], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[40:41], 0 ; GFX8-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 -; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s38 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v5, s39 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[40:41] ; GFX8-NEXT: s_cbranch_execnz .LBB148_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_or_b64 exec, exec, s[40:41] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: @@ -11426,27 +11574,27 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v7, s[36:37], -1, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB148_1 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 3856f0c327495..7a250f42f422a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -2025,23 +2025,23 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 9c1f9d21b9da3..a8d9b772ac694 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -20402,42 +20402,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -20452,41 +20451,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20501,34 +20498,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB78_1 @@ -20542,33 +20539,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 @@ -20582,33 +20579,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB78_1 @@ -20622,34 +20619,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB78_1 @@ -20781,42 +20778,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -20831,41 +20827,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20880,34 +20874,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB79_1 @@ -20921,33 +20915,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 @@ -20961,33 +20955,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB79_1 @@ -21003,34 +20997,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB79_1 @@ -21162,42 +21156,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -21212,41 +21205,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21261,34 +21252,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB80_1 @@ -21302,33 +21293,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 @@ -21342,33 +21333,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB80_1 @@ -21384,34 +21375,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB80_1 @@ -21546,42 +21537,43 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -21594,41 +21586,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21641,35 +21633,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB81_1 @@ -21680,36 +21672,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB81_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21719,36 +21711,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB81_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21758,37 +21750,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB81_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21913,42 +21905,43 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -21961,41 +21954,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22008,35 +22001,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB82_1 @@ -22047,36 +22040,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB82_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22086,36 +22079,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB82_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22127,37 +22120,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB82_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22283,42 +22276,43 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -22331,41 +22325,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22378,35 +22372,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB83_1 @@ -22417,36 +22411,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB83_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22456,36 +22450,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB83_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22497,37 +22491,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB83_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22663,42 +22657,41 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -22713,41 +22706,39 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22762,34 +22753,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB84_1 @@ -22803,35 +22794,35 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 @@ -22845,33 +22836,33 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB84_1 @@ -22887,34 +22878,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB84_1 @@ -23046,42 +23037,43 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -23094,41 +23086,41 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -23141,35 +23133,35 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB85_1 @@ -23180,38 +23172,38 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -23221,36 +23213,36 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB85_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -23262,37 +23254,37 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -23419,42 +23411,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -23469,41 +23460,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -23518,34 +23507,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB86_1 @@ -23559,33 +23548,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 @@ -23599,33 +23588,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB86_1 @@ -23639,34 +23628,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB86_1 @@ -23797,42 +23786,43 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -23845,41 +23835,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -23892,35 +23882,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB87_1 @@ -23931,36 +23921,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -23970,36 +23960,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB87_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -24009,37 +23999,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -24165,42 +24155,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -24215,41 +24204,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -24264,34 +24251,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB88_1 @@ -24305,33 +24292,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 @@ -24345,33 +24332,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB88_1 @@ -24385,34 +24372,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB88_1 @@ -24543,42 +24530,43 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -24591,41 +24579,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -24638,35 +24626,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB89_1 @@ -24677,36 +24665,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB89_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -24716,36 +24704,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB89_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -24755,37 +24743,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -24911,42 +24899,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -24961,41 +24948,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -25010,34 +24995,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB90_1 @@ -25051,33 +25036,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 @@ -25091,33 +25076,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB90_1 @@ -25131,34 +25116,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB90_1 @@ -25289,42 +25274,43 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -25337,41 +25323,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -25384,35 +25370,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB91_1 @@ -25423,36 +25409,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB91_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -25462,36 +25448,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB91_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -25501,37 +25487,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index f7cc0709109f9..f4ee49be32390 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -35,13 +35,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -80,13 +80,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -104,13 +104,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -197,13 +197,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -242,13 +242,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -266,13 +266,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -361,13 +361,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -406,13 +406,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -430,13 +430,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -523,21 +523,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -567,20 +567,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -590,20 +590,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -679,21 +679,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -723,20 +723,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -746,20 +746,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -838,21 +838,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -882,20 +882,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -905,20 +905,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1000,13 +1000,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1045,13 +1045,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1071,13 +1071,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1165,21 +1165,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1209,22 +1209,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1234,20 +1234,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1328,13 +1328,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1352,15 +1352,14 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1380,14 +1379,14 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1407,13 +1406,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1431,13 +1430,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1559,13 +1558,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1604,13 +1603,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1628,13 +1627,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1725,13 +1724,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1770,13 +1769,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1794,13 +1793,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1887,13 +1886,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1932,13 +1931,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1956,13 +1955,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2051,13 +2050,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -2096,13 +2095,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2120,13 +2119,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2213,21 +2212,21 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2257,20 +2256,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2280,20 +2279,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2369,21 +2368,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2413,20 +2412,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2436,20 +2435,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2528,21 +2527,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2572,20 +2571,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2595,20 +2594,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2690,13 +2689,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -2735,13 +2734,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2761,13 +2760,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2855,21 +2854,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2899,22 +2898,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2924,20 +2923,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3011,15 +3010,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3049,15 +3048,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3095,15 +3094,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3121,15 +3120,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3184,15 +3183,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3222,15 +3221,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3268,15 +3267,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3296,15 +3295,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3358,15 +3357,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3396,15 +3395,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3442,15 +3441,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3470,15 +3469,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3531,21 +3530,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3568,22 +3567,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3613,21 +3612,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3637,21 +3636,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3694,21 +3693,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3731,22 +3730,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3776,21 +3775,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3802,21 +3801,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3860,21 +3859,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3897,22 +3896,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3942,21 +3941,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3968,21 +3967,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4027,15 +4026,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4065,15 +4064,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4093,15 +4092,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4122,13 +4121,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -4146,15 +4145,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4172,15 +4171,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4198,26 +4197,28 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] +; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4232,13 +4233,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4246,13 +4248,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] +; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 ; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4276,15 +4279,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4314,15 +4317,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4360,15 +4363,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4386,15 +4389,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4453,9 +4456,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4467,12 +4469,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -4500,9 +4503,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4514,11 +4516,12 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4551,14 +4554,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -4578,9 +4581,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4592,12 +4594,13 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -4620,9 +4623,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4634,11 +4636,12 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4662,7 +4665,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4674,9 +4676,10 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4705,14 +4708,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4739,14 +4742,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -4773,17 +4776,17 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4894,35 +4897,33 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4933,7 +4934,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4947,25 +4948,24 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5000,14 +5000,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -5026,35 +5026,33 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5065,19 +5063,18 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5087,11 +5084,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5116,10 +5114,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5128,9 +5125,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5160,14 +5158,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5195,14 +5193,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5230,17 +5228,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5355,35 +5353,33 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5394,7 +5390,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5408,25 +5404,24 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5462,14 +5457,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -5488,35 +5483,33 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5527,19 +5520,18 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5549,11 +5541,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5578,10 +5571,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5590,9 +5582,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5622,14 +5615,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5657,14 +5650,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5692,17 +5685,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5818,9 +5811,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5832,10 +5824,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -5863,9 +5856,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5877,9 +5869,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5913,13 +5906,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -5939,9 +5932,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5953,10 +5945,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -5979,9 +5972,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5993,9 +5985,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -6020,7 +6013,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6032,8 +6024,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6062,13 +6055,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc @@ -6095,13 +6088,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -6128,16 +6121,16 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6245,37 +6238,35 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6293,37 +6284,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6341,29 +6331,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6373,38 +6363,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6416,37 +6404,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6458,31 +6446,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6493,31 +6481,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6527,31 +6515,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6561,32 +6549,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6691,37 +6679,35 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6739,37 +6725,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6788,29 +6773,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6820,38 +6805,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6863,37 +6846,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6905,31 +6888,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6940,31 +6923,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6974,31 +6957,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7008,32 +6991,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7139,16 +7122,16 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -7175,15 +7158,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7208,14 +7191,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX942-NEXT: v_max_f16_e32 v3, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 @@ -7234,16 +7217,16 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7265,15 +7248,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7296,14 +7279,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7325,14 +7308,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7351,14 +7334,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v5 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7379,19 +7362,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -7482,15 +7465,15 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7515,24 +7498,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7546,23 +7529,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX942-NEXT: v_max_f16_e32 v3, v4, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7573,15 +7556,15 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7601,25 +7584,25 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7631,23 +7614,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -7658,22 +7641,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7683,22 +7666,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7710,22 +7693,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7812,36 +7795,34 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7852,7 +7833,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7866,25 +7847,24 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7920,14 +7900,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 @@ -7946,35 +7926,33 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7985,19 +7963,18 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8007,11 +7984,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8036,10 +8014,9 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8048,9 +8025,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8080,14 +8058,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -8117,14 +8095,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -8152,17 +8130,17 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8277,38 +8255,36 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8326,38 +8302,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8375,29 +8350,29 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8407,38 +8382,36 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8450,37 +8423,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8492,31 +8465,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8527,33 +8500,33 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8563,31 +8536,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8597,32 +8570,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13627,15 +13600,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -13657,14 +13630,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -13682,15 +13655,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13710,14 +13683,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13737,13 +13710,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -13761,13 +13734,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13785,21 +13758,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -13920,15 +13893,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -13950,14 +13923,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -13975,15 +13948,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -14003,14 +13976,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -14030,13 +14003,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14054,13 +14027,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14080,21 +14053,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -14215,15 +14188,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -14245,14 +14218,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -14270,15 +14243,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -14298,14 +14271,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -14325,13 +14298,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14349,13 +14322,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14375,21 +14348,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -14513,21 +14486,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14541,22 +14514,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB49_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14566,22 +14539,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14593,21 +14566,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -14618,20 +14591,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14641,20 +14614,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14664,24 +14637,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14795,21 +14768,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14823,22 +14796,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14848,22 +14821,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14875,21 +14848,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -14900,20 +14873,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14923,20 +14896,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14948,24 +14921,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15080,21 +15053,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15108,22 +15081,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15133,22 +15106,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15160,21 +15133,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15185,20 +15158,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15208,20 +15181,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15233,24 +15206,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15374,15 +15347,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -15405,14 +15378,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -15430,15 +15403,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -15458,14 +15431,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -15485,13 +15458,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -15511,13 +15484,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15537,21 +15510,21 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15671,22 +15644,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15700,22 +15673,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15725,22 +15698,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15752,21 +15725,21 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -15777,22 +15750,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15802,20 +15775,20 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15827,24 +15800,24 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15964,41 +15937,40 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16018,39 +15990,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16067,35 +16038,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 @@ -16108,42 +16079,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16158,41 +16128,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16207,34 +16175,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -16248,33 +16216,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -16288,33 +16256,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -16328,34 +16296,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -16468,41 +16436,40 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16522,39 +16489,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16571,35 +16537,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 @@ -16612,42 +16578,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16662,41 +16627,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16711,34 +16674,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -16752,33 +16715,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -16792,33 +16755,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -16834,34 +16797,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -16974,41 +16937,40 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17028,39 +16990,38 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17077,35 +17038,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 @@ -17118,42 +17079,41 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17168,41 +17128,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17217,34 +17175,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -17258,33 +17216,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -17298,33 +17256,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -17340,34 +17298,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -17483,41 +17441,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17535,40 +17493,40 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17582,38 +17540,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB57_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17623,42 +17581,43 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17671,41 +17630,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17718,35 +17677,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -17757,36 +17716,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17796,36 +17755,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17835,37 +17794,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17971,41 +17930,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18023,40 +17982,40 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18070,38 +18029,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB58_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18111,42 +18070,43 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18159,41 +18119,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18206,35 +18166,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -18245,36 +18205,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18284,36 +18244,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18325,37 +18285,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18462,41 +18422,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18514,40 +18474,40 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18561,38 +18521,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB59_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18602,42 +18562,43 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18650,41 +18611,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18697,35 +18658,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -18736,36 +18697,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18775,36 +18736,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18816,37 +18777,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18962,42 +18923,41 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -19017,40 +18977,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -19067,35 +19026,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 @@ -19108,42 +19067,41 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19158,41 +19116,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19207,34 +19163,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -19248,35 +19204,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -19290,33 +19246,33 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -19332,34 +19288,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -19471,42 +19427,42 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -19524,41 +19480,41 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -19572,38 +19528,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB61_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19613,42 +19569,43 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19661,41 +19618,41 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19708,35 +19665,35 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -19747,38 +19704,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19788,36 +19745,36 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19829,37 +19786,37 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index b81af1fc9233d..e0bd727b864da 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -35,13 +35,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -80,13 +80,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -104,13 +104,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -197,13 +197,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -242,13 +242,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -266,13 +266,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -361,13 +361,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -406,13 +406,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -430,13 +430,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -523,21 +523,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -567,20 +567,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -590,20 +590,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -679,21 +679,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -723,20 +723,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -746,20 +746,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -838,21 +838,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -882,20 +882,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -905,20 +905,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1000,13 +1000,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1045,13 +1045,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1071,13 +1071,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1165,21 +1165,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX942-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB7_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1209,22 +1209,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1234,20 +1234,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1328,13 +1328,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1352,15 +1352,14 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1380,14 +1379,14 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1407,13 +1406,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1431,13 +1430,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1559,13 +1558,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1604,13 +1603,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1628,13 +1627,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1725,13 +1724,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1770,13 +1769,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1794,13 +1793,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1887,13 +1886,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -1932,13 +1931,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1956,13 +1955,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2051,13 +2050,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -2096,13 +2095,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2120,13 +2119,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2213,21 +2212,21 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB13_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2257,20 +2256,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2280,20 +2279,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2369,21 +2368,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2413,20 +2412,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2436,20 +2435,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2528,21 +2527,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB15_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2572,20 +2571,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2595,20 +2594,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2690,13 +2689,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -2735,13 +2734,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2761,13 +2760,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2855,21 +2854,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX942-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX942-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2899,22 +2898,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2924,20 +2923,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3011,15 +3010,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3049,15 +3048,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3095,15 +3094,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3121,15 +3120,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3184,15 +3183,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3222,15 +3221,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3268,15 +3267,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3296,15 +3295,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3358,15 +3357,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3396,15 +3395,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3442,15 +3441,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3470,15 +3469,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3531,21 +3530,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3568,22 +3567,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3613,21 +3612,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3637,21 +3636,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3694,21 +3693,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3731,22 +3730,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3776,21 +3775,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3802,21 +3801,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3860,21 +3859,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3897,22 +3896,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3942,21 +3941,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3968,21 +3967,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4027,15 +4026,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4065,15 +4064,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4093,15 +4092,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4122,13 +4121,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -4146,15 +4145,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4172,15 +4171,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4198,26 +4197,28 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] +; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4232,13 +4233,14 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4246,13 +4248,14 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] +; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 ; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4276,15 +4279,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4314,15 +4317,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4360,15 +4363,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4386,15 +4389,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4453,9 +4456,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4467,12 +4469,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -4500,9 +4503,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4514,11 +4516,12 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4551,14 +4554,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -4578,9 +4581,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4592,12 +4594,13 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 @@ -4620,9 +4623,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off @@ -4634,11 +4636,12 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4662,7 +4665,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4674,9 +4676,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4705,14 +4708,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4739,14 +4742,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -4773,17 +4776,17 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4894,35 +4897,33 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4933,7 +4934,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4947,25 +4948,24 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5000,14 +5000,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -5026,35 +5026,33 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5065,19 +5063,18 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5087,11 +5084,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5116,10 +5114,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5128,9 +5125,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5160,14 +5158,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5195,14 +5193,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5230,17 +5228,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5355,35 +5353,33 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5394,7 +5390,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5408,25 +5404,24 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5462,14 +5457,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -5488,35 +5483,33 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5527,19 +5520,18 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5549,11 +5541,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5578,10 +5571,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5590,9 +5582,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5622,14 +5615,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5657,14 +5650,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5692,17 +5685,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5818,9 +5811,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5832,10 +5824,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -5863,9 +5856,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5877,9 +5869,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5913,13 +5906,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX942-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -5939,9 +5932,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5953,10 +5945,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 @@ -5979,9 +5972,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off @@ -5993,9 +5985,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -6020,7 +6013,6 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6032,8 +6024,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6062,13 +6055,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc @@ -6095,13 +6088,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -6128,16 +6121,16 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6245,37 +6238,35 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6293,37 +6284,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6341,29 +6331,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB30_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6373,38 +6363,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6416,37 +6404,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6458,31 +6446,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6493,31 +6481,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6527,31 +6515,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6561,32 +6549,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6691,37 +6679,35 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6739,37 +6725,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6788,29 +6773,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB31_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6820,38 +6805,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6863,37 +6846,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6905,31 +6888,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6940,31 +6923,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6974,31 +6957,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7008,32 +6991,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7139,16 +7122,16 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -7175,15 +7158,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7208,14 +7191,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX942-NEXT: v_min_f16_e32 v3, v3, v4 ; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 @@ -7234,16 +7217,16 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7265,15 +7248,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7296,14 +7279,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7325,14 +7308,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7351,14 +7334,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v5 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7379,19 +7362,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -7482,15 +7465,15 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7515,24 +7498,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v4, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7546,23 +7529,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX942-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX942-NEXT: v_min_f16_e32 v3, v4, v3 +; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB33_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7573,15 +7556,15 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7601,25 +7584,25 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v4, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7631,23 +7614,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -7658,22 +7641,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7683,22 +7666,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7710,22 +7693,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7812,36 +7795,34 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v5.l, v2.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7852,7 +7833,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7866,25 +7847,24 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7920,14 +7900,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX942-NEXT: v_not_b32_e32 v4, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX942-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX942-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX942-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX942-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 @@ -7946,35 +7926,33 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7985,19 +7963,18 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8007,11 +7984,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8036,10 +8014,9 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8048,9 +8025,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8080,14 +8058,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -8117,14 +8095,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -8152,17 +8130,17 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8277,38 +8255,36 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v3.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8326,38 +8302,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8375,29 +8350,29 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_load_dword v3, v[0:1], off -; GFX942-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX942-NEXT: global_load_dword v5, v[0:1], off +; GFX942-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX942-NEXT: s_mov_b32 s0, 0xffff -; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX942-NEXT: v_not_b32_e32 v5, v5 +; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX942-NEXT: v_not_b32_e32 v6, v4 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX942-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX942-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX942-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX942-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX942-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX942-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB35_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8407,38 +8382,36 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8450,37 +8423,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8492,31 +8465,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8527,33 +8500,33 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8563,31 +8536,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8597,32 +8570,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13627,15 +13600,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -13657,14 +13630,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -13682,15 +13655,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13710,14 +13683,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13737,13 +13710,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -13761,13 +13734,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13785,21 +13758,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -13920,15 +13893,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -13950,14 +13923,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -13975,15 +13948,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -14003,14 +13976,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -14030,13 +14003,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14054,13 +14027,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14080,21 +14053,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -14215,15 +14188,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -14245,14 +14218,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -14270,15 +14243,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -14298,14 +14271,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -14325,13 +14298,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -14349,13 +14322,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -14375,21 +14348,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -14513,21 +14486,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14541,22 +14514,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB49_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14566,22 +14539,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14593,21 +14566,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -14618,20 +14591,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14641,20 +14614,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14664,24 +14637,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14795,21 +14768,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14823,22 +14796,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14848,22 +14821,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14875,21 +14848,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -14900,20 +14873,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14923,20 +14896,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14948,24 +14921,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15080,21 +15053,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15108,22 +15081,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15133,22 +15106,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15160,21 +15133,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15185,20 +15158,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15208,20 +15181,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15233,24 +15206,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15374,15 +15347,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -15405,14 +15378,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -15430,15 +15403,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -15458,14 +15431,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -15485,13 +15458,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -15511,13 +15484,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15537,21 +15510,21 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15671,22 +15644,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15700,22 +15673,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX942-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15725,22 +15698,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15752,21 +15725,21 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -15777,22 +15750,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15802,20 +15775,20 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15827,24 +15800,24 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15964,41 +15937,40 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16018,39 +15990,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16067,35 +16038,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 @@ -16108,42 +16079,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16158,41 +16128,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16207,34 +16175,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -16248,33 +16216,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -16288,33 +16256,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -16328,34 +16296,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -16468,41 +16436,40 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16522,39 +16489,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16571,35 +16537,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 @@ -16612,42 +16578,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16662,41 +16627,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16711,34 +16674,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -16752,33 +16715,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -16792,33 +16755,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -16834,34 +16797,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -16974,41 +16937,40 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17028,39 +16990,38 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17077,35 +17038,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 @@ -17118,42 +17079,41 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17168,41 +17128,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17217,34 +17175,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -17258,33 +17216,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -17298,33 +17256,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -17340,34 +17298,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -17483,41 +17441,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17535,40 +17493,40 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17582,38 +17540,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB57_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17623,42 +17581,43 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17671,41 +17630,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17718,35 +17677,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -17757,36 +17716,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17796,36 +17755,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17835,37 +17794,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17971,41 +17930,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18023,40 +17982,40 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18070,38 +18029,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB58_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18111,42 +18070,43 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18159,41 +18119,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18206,35 +18166,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -18245,36 +18205,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18284,36 +18244,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18325,37 +18285,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18462,41 +18422,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18514,40 +18474,40 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18561,38 +18521,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB59_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18602,42 +18562,43 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18650,41 +18611,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18697,35 +18658,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -18736,36 +18697,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18775,36 +18736,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18816,37 +18777,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18962,42 +18923,41 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -19017,40 +18977,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -19067,35 +19026,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB60_1 @@ -19108,42 +19067,41 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19158,41 +19116,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19207,34 +19163,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -19248,35 +19204,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -19290,33 +19246,33 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -19332,34 +19288,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -19471,42 +19427,42 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -19524,41 +19480,41 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -19572,38 +19528,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB61_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19613,42 +19569,43 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19661,41 +19618,41 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19708,35 +19665,35 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -19747,38 +19704,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19788,36 +19745,36 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19829,37 +19786,37 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index b8762d13e1327..edaaabbd51e67 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -16325,41 +16325,40 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16379,39 +16378,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16428,35 +16426,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB50_1 @@ -16469,42 +16467,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16519,41 +16516,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16568,34 +16563,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -16609,33 +16604,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 @@ -16649,33 +16644,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 @@ -16689,34 +16684,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 @@ -16829,41 +16824,40 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16883,39 +16877,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -16932,35 +16925,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB51_1 @@ -16973,42 +16966,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17023,41 +17015,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17072,34 +17062,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -17113,33 +17103,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 @@ -17153,33 +17143,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 @@ -17195,34 +17185,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 @@ -17335,41 +17325,40 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17389,39 +17378,38 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17438,35 +17426,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB52_1 @@ -17479,42 +17467,41 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17529,41 +17516,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17578,34 +17563,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -17619,33 +17604,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 @@ -17659,33 +17644,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB52_1 @@ -17701,34 +17686,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -17844,41 +17829,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -17896,40 +17881,40 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -17943,38 +17928,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB53_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17984,42 +17969,43 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18032,41 +18018,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18079,35 +18065,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -18118,36 +18104,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18157,36 +18143,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18196,37 +18182,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18332,41 +18318,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18384,40 +18370,40 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18431,38 +18417,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB54_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18472,42 +18458,43 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18520,41 +18507,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18567,35 +18554,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -18606,36 +18593,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18645,36 +18632,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18686,37 +18673,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18823,41 +18810,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -18875,40 +18862,40 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -18922,38 +18909,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB55_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18963,42 +18950,43 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19011,41 +18999,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19058,35 +19046,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -19097,36 +19085,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19136,36 +19124,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19177,37 +19165,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19323,42 +19311,41 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -19378,40 +19365,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -19428,35 +19414,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v3 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB56_1 @@ -19469,42 +19455,41 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -19519,41 +19504,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19568,34 +19551,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -19609,35 +19592,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -19651,33 +19634,33 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -19693,34 +19676,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -19832,42 +19815,42 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -19885,41 +19868,41 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -19933,38 +19916,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX942-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX942-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX942-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB57_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19974,42 +19957,43 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -20022,41 +20006,41 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-FAKE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20069,35 +20053,35 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -20108,38 +20092,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20149,36 +20133,36 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20190,37 +20174,37 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index 6fe9e1d5561de..4ff0462c6e41b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -56,19 +56,19 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1 ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_movk_i32 s2, 0x100 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_movk_i32 s1, 0x100 ; GCN-NEXT: .LBB1_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1] -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc -; GCN-NEXT: global_load_short_d16_hi v0, v[3:4], off glc +; GCN-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc +; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 ; GCN-NEXT: s_cbranch_vccz .LBB1_1 ; GCN-NEXT: ; %bb.2: ; %bb2 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index ffab56847edca..6e715a920a8d3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -1632,18 +1632,20 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB34_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB34_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1722,22 +1724,24 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB35_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB35_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar: @@ -1812,18 +1816,18 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB36_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB36_1 @@ -1901,24 +1905,26 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB37_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB37_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar: @@ -2417,18 +2423,20 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB45_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB45_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2507,22 +2515,24 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB46_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB46_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar: @@ -2597,18 +2607,18 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB47_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_and_b32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB47_1 @@ -2686,24 +2696,26 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB48_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_and_b32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB48_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar: @@ -3178,19 +3190,21 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s6, v3 -; VI-NEXT: v_not_b32_e32 v2, v2 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3271,23 +3285,25 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s6, v3 -; VI-NEXT: v_not_b32_e32 v2, v2 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar: @@ -3364,19 +3380,19 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_and_b32_e32 v0, s6, v4 -; VI-NEXT: v_not_b32_e32 v3, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB57_1 @@ -3456,25 +3472,27 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB58_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_and_b32_e32 v0, s6, v4 -; VI-NEXT: v_not_b32_e32 v3, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar: @@ -4028,18 +4046,20 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB65_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_or_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB65_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4118,22 +4138,24 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB66_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_or_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB66_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar: @@ -4208,18 +4230,18 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB67_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_or_b32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_or_b32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB67_1 @@ -4297,24 +4319,26 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB68_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_or_b32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_or_b32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB68_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar: @@ -4813,18 +4837,20 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB76_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_xor_b32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB76_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4903,22 +4929,24 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB77_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_xor_b32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB77_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar: @@ -4993,18 +5021,18 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB78_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_xor_b32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB78_1 @@ -5082,24 +5110,26 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB79_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_xor_b32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB79_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar: @@ -5598,18 +5628,20 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5688,22 +5720,24 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar: @@ -5778,18 +5812,18 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_max_i32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB89_1 @@ -5867,24 +5901,26 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_max_i32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar: @@ -5953,26 +5989,26 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s3, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_i32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_max_i32_e32 v0, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -6059,32 +6095,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: @@ -6166,24 +6202,24 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s3, s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB93_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_i32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_max_i32_e32 v0, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -6269,30 +6305,30 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB94_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: @@ -6766,18 +6802,20 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6856,22 +6894,24 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar: @@ -6946,18 +6986,18 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_max_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB103_1 @@ -7035,24 +7075,26 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_max_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar: @@ -7121,26 +7163,26 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s3, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB105_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_u32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_max_u32_e32 v0, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -7227,32 +7269,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB106_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -7342,30 +7384,30 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: @@ -7839,18 +7881,20 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_min_u32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7929,22 +7973,24 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB115_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_min_u32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar: @@ -8019,18 +8065,18 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_min_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB116_1 @@ -8108,24 +8154,26 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB117_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_min_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: @@ -8587,18 +8635,20 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8677,22 +8727,24 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar: @@ -8767,18 +8819,18 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_i32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB126_1 @@ -8856,24 +8908,26 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_i32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar: @@ -8942,26 +8996,26 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s3, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_min_i32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_min_i32_e32 v0, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -9048,32 +9102,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB129_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_min_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: @@ -9146,25 +9200,25 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VI-NEXT: s_load_dword s2, s[4:5], 0x2c -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s3, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_load_dword s5, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB130_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_min_i32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_min_i32_e32 v0, s4, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; VI-NEXT: s_cbranch_execnz .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -9245,30 +9299,30 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB131_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_min_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB131_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: @@ -9768,20 +9822,22 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB138_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB138_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9864,24 +9920,26 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB139_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB139_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -9960,20 +10018,20 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB140_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v4 -; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 -; VI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB140_1 @@ -10055,26 +10113,28 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB141_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v4 -; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 -; VI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB141_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -10546,42 +10606,42 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v5, s6, 0 -; SI-NEXT: v_writelane_b32 v5, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v2, s34 ; SI-NEXT: .LBB148_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s34 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB148_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v5, 1 -; SI-NEXT: v_readlane_b32 s6, v5, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10591,23 +10651,25 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[36:37], 0 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: .LBB148_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB148_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10620,13 +10682,13 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc @@ -10649,42 +10711,42 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v5, s6, 0 -; SI-NEXT: v_writelane_b32 v5, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v2, s34 ; SI-NEXT: .LBB149_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s34 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB149_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v5, 1 -; SI-NEXT: v_readlane_b32 s6, v5, 0 +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10692,31 +10754,33 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 16 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[36:37], 0 -; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: s_add_u32 s36, s4, 16 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 ; VI-NEXT: .LBB149_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s36 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s37 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; VI-NEXT: s_cbranch_execnz .LBB149_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: @@ -10725,13 +10789,13 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc @@ -10755,42 +10819,42 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v5, s6, 0 -; SI-NEXT: v_writelane_b32 v5, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v2, s34 ; SI-NEXT: .LBB150_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v2 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB150_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v5, 1 -; SI-NEXT: v_readlane_b32 s6, v5, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10801,23 +10865,23 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[36:37], 0 -; VI-NEXT: v_mov_b32_e32 v3, s6 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB150_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] -; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB150_1 @@ -10831,20 +10895,20 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: .LBB150_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v3 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] glc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_cbranch_execnz .LBB150_1 @@ -10860,42 +10924,42 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v5, s6, 0 -; SI-NEXT: v_writelane_b32 v5, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v2, s34 ; SI-NEXT: .LBB151_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v2 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB151_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v5, 1 -; SI-NEXT: v_readlane_b32 s6, v5, 0 +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10903,31 +10967,33 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 16 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[36:37], 0 -; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: s_add_u32 s36, s4, 16 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 ; VI-NEXT: .LBB151_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s36 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] -; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc +; VI-NEXT: v_mov_b32_e32 v3, s37 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; VI-NEXT: s_cbranch_execnz .LBB151_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[36:37] +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar: @@ -10936,20 +11002,20 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: .LBB151_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v3 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] offset:16 glc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_cbranch_execnz .LBB151_1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 74f0f64c935b4..43caa77557636 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -1678,43 +1678,43 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v9, s6, 0 -; SI-NEXT: v_writelane_b32 v9, s7, 1 -; SI-NEXT: s_mov_b32 s35, s7 -; SI-NEXT: s_mov_b32 s34, s6 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: .LBB34_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2 -; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc +; SI-NEXT: v_subrev_i32_e32 v0, vcc, s35, v2 +; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v3 -; SI-NEXT: v_mov_b32_e32 v7, v2 -; SI-NEXT: v_mov_b32_e32 v6, v1 -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v5 -; SI-NEXT: v_mov_b32_e32 v3, v6 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB34_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v9, 1 -; SI-NEXT: v_readlane_b32 s6, v9, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1725,15 +1725,15 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB34_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 -; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1753,12 +1753,12 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1780,43 +1780,43 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v9, s6, 0 -; SI-NEXT: v_writelane_b32 v9, s7, 1 -; SI-NEXT: s_mov_b32 s35, s7 -; SI-NEXT: s_mov_b32 s34, s6 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: .LBB35_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2 -; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc +; SI-NEXT: v_subrev_i32_e32 v0, vcc, s35, v2 +; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v3 -; SI-NEXT: v_mov_b32_e32 v7, v2 -; SI-NEXT: v_mov_b32_e32 v6, v1 -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v5 -; SI-NEXT: v_mov_b32_e32 v3, v6 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB35_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v9, 1 -; SI-NEXT: v_readlane_b32 s6, v9, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1826,27 +1826,29 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB35_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 -; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB35_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar: @@ -1855,12 +1857,12 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1883,43 +1885,43 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v9, s6, 0 -; SI-NEXT: v_writelane_b32 v9, s7, 1 -; SI-NEXT: s_mov_b32 s35, s7 -; SI-NEXT: s_mov_b32 s34, s6 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: .LBB36_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v1 -; SI-NEXT: v_mov_b32_e32 v7, v0 -; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7 -; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v5 -; SI-NEXT: v_mov_b32_e32 v1, v6 -; SI-NEXT: v_mov_b32_e32 v2, v7 -; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_subrev_i32_e32 v2, vcc, s35, v4 +; SI-NEXT: v_subb_u32_e32 v3, vcc, v5, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB36_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v9, 1 -; SI-NEXT: v_readlane_b32 s6, v9, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1930,21 +1932,21 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB36_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, v1 -; VI-NEXT: v_mov_b32_e32 v7, v0 -; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 -; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB36_1 @@ -1958,18 +1960,18 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB36_1 @@ -1985,43 +1987,43 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v9, s6, 0 -; SI-NEXT: v_writelane_b32 v9, s7, 1 -; SI-NEXT: s_mov_b32 s35, s7 -; SI-NEXT: s_mov_b32 s34, s6 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: .LBB37_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v1 -; SI-NEXT: v_mov_b32_e32 v7, v0 -; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7 -; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v5 -; SI-NEXT: v_mov_b32_e32 v1, v6 -; SI-NEXT: v_mov_b32_e32 v2, v7 -; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_subrev_i32_e32 v2, vcc, s35, v4 +; SI-NEXT: v_subb_u32_e32 v3, vcc, v5, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB37_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v9, 1 -; SI-NEXT: v_readlane_b32 s6, v9, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2031,27 +2033,29 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB37_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, v1 -; VI-NEXT: v_mov_b32_e32 v7, v0 -; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 -; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB37_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar: @@ -2060,18 +2064,18 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB37_1 @@ -2564,14 +2568,14 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB44_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, s7, v3 ; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -2662,26 +2666,28 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB45_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, s7, v3 ; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB45_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar: @@ -2763,20 +2769,20 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB46_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_and_b32_e32 v5, s7, v7 -; VI-NEXT: v_and_b32_e32 v4, s6, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB46_1 @@ -2861,26 +2867,28 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB47_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_and_b32_e32 v5, s7, v7 -; VI-NEXT: v_and_b32_e32 v4, s6, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB47_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar: @@ -3418,14 +3426,14 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 ; VI-NEXT: v_and_b32_e32 v6, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_not_b32_e32 v1, v0 ; VI-NEXT: v_not_b32_e32 v0, v6 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -3522,15 +3530,17 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 ; VI-NEXT: v_and_b32_e32 v6, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: v_not_b32_e32 v1, v0 ; VI-NEXT: v_not_b32_e32 v0, v6 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -3538,12 +3548,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar: @@ -3629,22 +3639,22 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_and_b32_e32 v0, s7, v7 -; VI-NEXT: v_and_b32_e32 v1, s6, v6 -; VI-NEXT: v_not_b32_e32 v5, v0 -; VI-NEXT: v_not_b32_e32 v4, v1 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_and_b32_e32 v0, s7, v3 +; VI-NEXT: v_and_b32_e32 v6, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_not_b32_e32 v1, v0 +; VI-NEXT: v_not_b32_e32 v0, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB56_1 @@ -3733,28 +3743,30 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_and_b32_e32 v0, s7, v7 -; VI-NEXT: v_and_b32_e32 v1, s6, v6 -; VI-NEXT: v_not_b32_e32 v5, v0 -; VI-NEXT: v_not_b32_e32 v4, v1 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_and_b32_e32 v0, s7, v3 +; VI-NEXT: v_and_b32_e32 v6, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_not_b32_e32 v1, v0 +; VI-NEXT: v_not_b32_e32 v0, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar: @@ -4384,14 +4396,14 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB64_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v1, s7, v3 ; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4482,26 +4494,28 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB65_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v1, s7, v3 ; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB65_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar: @@ -4583,20 +4597,20 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB66_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_or_b32_e32 v5, s7, v7 -; VI-NEXT: v_or_b32_e32 v4, s6, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB66_1 @@ -4681,26 +4695,28 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB67_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_or_b32_e32 v5, s7, v7 -; VI-NEXT: v_or_b32_e32 v4, s6, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB67_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar: @@ -5212,14 +5228,14 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB74_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v1, s7, v3 ; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -5310,26 +5326,28 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB75_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v1, s7, v3 ; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB75_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar: @@ -5411,20 +5429,20 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB76_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_xor_b32_e32 v5, s7, v7 -; VI-NEXT: v_xor_b32_e32 v4, s6, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB76_1 @@ -5509,26 +5527,28 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB77_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_xor_b32_e32 v5, s7, v7 -; VI-NEXT: v_xor_b32_e32 v4, s6, v6 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB77_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar: @@ -6006,45 +6026,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6055,17 +6075,17 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6085,14 +6105,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6114,45 +6134,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6162,29 +6182,31 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar: @@ -6193,14 +6215,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6223,45 +6245,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6272,23 +6294,23 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB86_1 @@ -6302,20 +6324,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB86_1 @@ -6331,45 +6353,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6379,29 +6401,31 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar: @@ -6410,20 +6434,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 @@ -6447,29 +6471,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6479,26 +6503,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6515,24 +6539,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6561,17 +6585,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -6598,68 +6622,68 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 @@ -6688,29 +6712,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6722,30 +6746,30 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v6, s3 -; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -6754,24 +6778,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6799,17 +6823,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -6838,64 +6862,64 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 @@ -7358,45 +7382,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7407,17 +7431,17 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -7437,14 +7461,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7466,45 +7490,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7514,29 +7538,31 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB99_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar: @@ -7545,14 +7571,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7575,45 +7601,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7624,23 +7650,23 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB100_1 @@ -7654,20 +7680,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB100_1 @@ -7683,45 +7709,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7731,29 +7757,31 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar: @@ -7762,20 +7790,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB101_1 @@ -7799,29 +7827,29 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7831,26 +7859,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -7867,24 +7895,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -7913,17 +7941,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -7950,68 +7978,68 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 @@ -8039,17 +8067,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -8078,64 +8106,64 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB104_1 @@ -8598,45 +8626,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8647,17 +8675,17 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8677,14 +8705,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8706,45 +8734,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8754,29 +8782,31 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB112_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar: @@ -8785,14 +8815,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8815,45 +8845,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB113_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8864,23 +8894,23 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB113_1 @@ -8894,20 +8924,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB113_1 @@ -8923,45 +8953,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8971,29 +9001,31 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar: @@ -9002,20 +9034,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 @@ -9474,45 +9506,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9523,17 +9555,17 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -9553,14 +9585,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -9582,45 +9614,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9630,29 +9662,31 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar: @@ -9661,14 +9695,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -9691,45 +9725,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9740,23 +9774,23 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB123_1 @@ -9770,20 +9804,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB123_1 @@ -9799,45 +9833,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9847,29 +9881,31 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar: @@ -9878,20 +9914,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 @@ -9915,29 +9951,29 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9947,26 +9983,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -9983,24 +10019,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -10029,17 +10065,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -10066,68 +10102,68 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 @@ -10149,35 +10185,35 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_mov_b32_e32 v3, s5 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm @@ -10188,18 +10224,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s3 -; VI-NEXT: v_mov_b32_e32 v7, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -10216,20 +10252,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -10256,17 +10292,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -10295,64 +10331,64 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 @@ -10888,15 +10924,15 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB135_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -10995,16 +11031,18 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB136_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -11012,12 +11050,12 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB136_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar: @@ -11105,23 +11143,23 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB137_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; VI-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB137_1 @@ -11212,29 +11250,31 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB138_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; VI-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB138_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar: @@ -11763,19 +11803,17 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB145_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -11783,29 +11821,31 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v4, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB145_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -11816,19 +11856,19 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_mov_b64 s[38:39], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB145_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 -; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -11836,12 +11876,12 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB145_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[38:39] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar: @@ -11849,16 +11889,16 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] -; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 ; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc @@ -11867,12 +11907,12 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_cbranch_execnz .LBB145_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -11883,19 +11923,17 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB146_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -11903,29 +11941,31 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[2:3] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v4, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB146_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -11933,22 +11973,24 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 32 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_add_u32 s36, s4, 32 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_mov_b64 s[38:39], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: .LBB146_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 -; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s36 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s37 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -11969,16 +12011,16 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 ; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc @@ -11987,12 +12029,12 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_cbranch_execnz .LBB146_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -12004,49 +12046,49 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB147_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8 -; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc -; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9] +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v5, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[4:5] +; SI-NEXT: v_mov_b32_e32 v2, s35 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] -; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB147_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -12057,27 +12099,27 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[38:39], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB147_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 -; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] -; VI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; VI-NEXT: s_cbranch_execnz .LBB147_1 @@ -12091,24 +12133,24 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[7:8] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v8, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[5:6] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[36:37], -1, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[36:37], -1, v6, s[36:37] ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v4, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB147_1 @@ -12124,49 +12166,49 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB148_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8 -; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc -; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9] +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v5, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[4:5] +; SI-NEXT: v_mov_b32_e32 v2, s35 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] -; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB148_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -12174,35 +12216,37 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 32 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[38:39], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: s_add_u32 s38, s4, 32 +; VI-NEXT: s_addc_u32 s39, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s38 +; VI-NEXT: v_mov_b32_e32 v1, s39 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[40:41], 0 ; VI-NEXT: .LBB148_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 -; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s38 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] -; VI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v5, s39 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; VI-NEXT: s_andn2_b64 exec, exec, s[40:41] ; VI-NEXT: s_cbranch_execnz .LBB148_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[38:39] +; VI-NEXT: s_or_b64 exec, exec, s[40:41] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: @@ -12211,24 +12255,24 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[7:8] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v8, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[5:6] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[36:37], -1, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[36:37], -1, v6, s[36:37] ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v4, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB148_1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 6351bb39e97f5..bdd19a3bfbcab 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -328,13 +328,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -400,14 +399,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -758,14 +757,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1367,13 +1366,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -1439,14 +1437,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1797,14 +1795,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2406,13 +2404,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -2478,14 +2475,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2836,14 +2833,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -3528,13 +3525,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[6:7], v[0:1] ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -3604,15 +3600,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3789,16 +3785,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3858,16 +3854,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -3911,25 +3906,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4283,17 +4277,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -4375,15 +4368,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -4796,13 +4789,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[6:7], v[0:1] ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -4872,15 +4864,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5057,16 +5049,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5126,16 +5118,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -5179,25 +5170,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5551,17 +5541,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -5643,15 +5632,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -6064,13 +6053,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[6:7], v[0:1] ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -6140,15 +6128,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6325,16 +6313,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6394,16 +6382,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6447,25 +6434,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6819,17 +6805,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6911,15 +6896,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index a9ac00863cd17..ea85bbb72b103 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -328,13 +328,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -400,14 +399,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -758,14 +757,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1367,13 +1366,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -1439,14 +1437,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1797,14 +1795,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2406,13 +2404,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -2478,14 +2475,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2836,14 +2833,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -3528,13 +3525,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1] ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -3604,15 +3600,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3789,16 +3785,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3858,16 +3854,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -3911,25 +3906,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4283,17 +4277,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -4375,15 +4368,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -4796,13 +4789,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1] ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -4872,15 +4864,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5057,16 +5049,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5126,16 +5118,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -5179,25 +5170,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5551,17 +5541,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -5643,15 +5632,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -6064,13 +6053,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1] ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -6140,15 +6128,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6325,16 +6313,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6394,16 +6382,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6447,25 +6434,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6819,17 +6805,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6911,15 +6896,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 59dfd713ef4fd..78b6cac98cd67 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -109,51 +109,49 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_ashr_i32 s7, s6, 31 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s0, s4 -; CHECK-NEXT: s_addc_u32 s1, s1, s5 -; CHECK-NEXT: s_add_u32 s4, s0, -8 -; CHECK-NEXT: s_addc_u32 s5, s1, -1 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: s_add_u32 s4, s0, s4 +; CHECK-NEXT: s_addc_u32 s5, s1, s5 +; CHECK-NEXT: s_add_u32 s0, s4, -8 +; CHECK-NEXT: s_addc_u32 s1, s5, -1 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 9 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; CHECK-NEXT: .LBB3_1: ; %bb0 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 -; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base -; CHECK-NEXT: s_cmp_eq_u32 s5, s1 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; CHECK-NEXT: s_mov_b64 s[0:1], -1 +; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base +; CHECK-NEXT: s_cmp_eq_u32 s1, s5 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_mov_b64 s[4:5], -1 ; CHECK-NEXT: s_cbranch_vccnz .LBB3_5 ; CHECK-NEXT: ; %bb.3: ; %Flow6 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB3_10 ; CHECK-NEXT: .LBB3_4: ; %atomicrmw.phi ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB3_5: ; %atomicrmw.check.private -; CHECK-NEXT: s_mov_b64 s[0:1], src_private_base -; CHECK-NEXT: s_cmp_eq_u32 s5, s1 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; CHECK-NEXT: s_mov_b64 s[0:1], -1 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: s_cmp_eq_u32 s1, s5 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_mov_b64 s[4:5], -1 ; CHECK-NEXT: s_cbranch_vccz .LBB3_7 ; CHECK-NEXT: ; %bb.6: ; %atomicrmw.global ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; CHECK-NEXT: global_atomic_add_f64 v0, v[2:3], s[4:5] +; CHECK-NEXT: global_atomic_add_f64 v0, v[2:3], s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: .LBB3_7: ; %Flow -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB3_9 ; CHECK-NEXT: ; %bb.8: ; %atomicrmw.private -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 -; CHECK-NEXT: s_cselect_b32 s0, s4, -1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b32 s4, s0, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; CHECK-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -163,8 +161,8 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: .LBB3_9: ; %Flow5 ; CHECK-NEXT: s_cbranch_execnz .LBB3_4 ; CHECK-NEXT: .LBB3_10: ; %atomicrmw.shared -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 -; CHECK-NEXT: s_cselect_b32 s0, s4, -1 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b32 s0, s0, -1 ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; CHECK-NEXT: ds_add_f64 v0, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index eb5c5ef15ed56..2a178d565c29c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -55,18 +55,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] ; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24 +; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s12, s13 ; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s20, 0 +; GFX11-NEXT: s_mov_b32 s19, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 @@ -75,7 +75,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s18, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s24, -1, 0 +; GFX11-NEXT: s_cselect_b32 s19, -1, 0 ; GFX11-NEXT: s_bitcmp0_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX11-NEXT: ; %bb.2: ; %bb15 @@ -111,58 +111,58 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28 ; GFX11-NEXT: s_mul_i32 s1, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 ; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_or_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, s30 -; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_mul_i32 s0, s0, s20 -; GFX11-NEXT: s_or_b32 s0, s19, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s24, s0 ; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 -; GFX11-NEXT: s_mov_b32 s0, s1 -; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 +; GFX11-NEXT: global_load_u16 v0, v0, s[20:21] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s19 ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s20, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 ; GFX11-NEXT: s_and_b32 s1, s8, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-NEXT: s_cselect_b32 s1, s21, s20 +; GFX11-NEXT: s_and_b32 s20, 0xffff, s13 ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-NEXT: s_and_b32 s20, s9, exec_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b32 s20, -1, 0 +; GFX11-NEXT: s_and_b32 s22, s9, exec_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s20 +; GFX11-NEXT: v_readfirstlane_b32 s20, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s13, s19, s13 -; GFX11-NEXT: s_bitcmp1_b32 s13, 0 -; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 +; GFX11-NEXT: v_readfirstlane_b32 s21, v0 +; GFX11-NEXT: s_cselect_b32 s20, s21, s20 +; GFX11-NEXT: s_bitcmp1_b32 s20, 0 +; GFX11-NEXT: s_cselect_b32 s20, 0x100, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s13, s0 +; GFX11-NEXT: s_or_b32 s13, s20, s13 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow ; GFX11-NEXT: s_mov_b32 s0, 0 @@ -180,7 +180,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s18, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s20, s2, exec_lo +; GFX11-NEXT: s_and_b32 s19, s2, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -197,10 +197,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s20, s20, exec_lo +; GFX11-NEXT: s_or_b32 s19, s19, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_and_saveexec_b32 s0, s20 +; GFX11-NEXT: s_and_saveexec_b32 s0, s19 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll index 300124848c1aa..3df22eeaa88c2 100644 --- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll @@ -5,58 +5,58 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-LABEL: issue130120: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_movk_i32 s0, 0xf4 +; CHECK-NEXT: s_movk_i32 s1, 0xf8 +; CHECK-NEXT: s_add_i32 s34, s32, s0 +; CHECK-NEXT: s_add_i32 s35, s32, s1 +; CHECK-NEXT: s_movk_i32 s0, 0xfc +; CHECK-NEXT: s_movk_i32 s1, 0x100 +; CHECK-NEXT: s_add_i32 s36, s32, s0 +; CHECK-NEXT: s_add_i32 s37, s32, s1 +; CHECK-NEXT: s_movk_i32 s0, 0x104 +; CHECK-NEXT: s_movk_i32 s1, 0x108 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_movk_i32 s1, 0xf4 -; CHECK-NEXT: s_movk_i32 s2, 0xf8 -; CHECK-NEXT: s_movk_i32 s3, 0xfc -; CHECK-NEXT: s_movk_i32 s34, 0x100 -; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: s_movk_i32 s35, 0x104 -; CHECK-NEXT: s_movk_i32 s36, 0x108 -; CHECK-NEXT: s_movk_i32 s37, 0x110 -; CHECK-NEXT: s_movk_i32 s38, 0x120 -; CHECK-NEXT: s_add_i32 s0, s32, 0xf0 -; CHECK-NEXT: s_add_i32 s1, s32, s1 -; CHECK-NEXT: s_add_i32 s2, s32, s2 -; CHECK-NEXT: s_add_i32 s3, s32, s3 -; CHECK-NEXT: s_add_i32 s34, s32, s34 -; CHECK-NEXT: s_add_i32 s35, s32, s35 -; CHECK-NEXT: s_add_i32 s36, s32, s36 -; CHECK-NEXT: s_add_i32 s37, s32, s37 -; CHECK-NEXT: s_add_i32 s38, s32, s38 -; CHECK-NEXT: s_or_b32 s39, s32, 4 -; CHECK-NEXT: s_or_b32 s40, s32, 8 -; CHECK-NEXT: s_or_b32 s41, s32, 12 -; CHECK-NEXT: s_add_i32 s42, s32, 16 -; CHECK-NEXT: s_add_i32 s43, s32, 20 -; CHECK-NEXT: s_add_i32 s44, s32, 24 -; CHECK-NEXT: s_mov_b32 s46, 1 -; CHECK-NEXT: s_movk_i32 s45, 0x990 -; CHECK-NEXT: s_mov_b32 s48, 0 +; CHECK-NEXT: s_add_i32 s38, s32, s0 +; CHECK-NEXT: s_add_i32 s39, s32, s1 +; CHECK-NEXT: s_movk_i32 s0, 0x110 +; CHECK-NEXT: s_movk_i32 s1, 0x120 +; CHECK-NEXT: s_add_i32 s3, s32, 0xf0 +; CHECK-NEXT: s_add_i32 s40, s32, s0 +; CHECK-NEXT: s_add_i32 s41, s32, s1 +; CHECK-NEXT: s_or_b32 s42, s32, 4 +; CHECK-NEXT: s_or_b32 s43, s32, 8 +; CHECK-NEXT: s_or_b32 s44, s32, 12 +; CHECK-NEXT: s_add_i32 s45, s32, 16 +; CHECK-NEXT: s_add_i32 s46, s32, 20 +; CHECK-NEXT: s_add_i32 s47, s32, 24 +; CHECK-NEXT: s_mov_b32 s49, 1 +; CHECK-NEXT: s_movk_i32 s48, 0x990 +; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: .LBB0_1: ; %bb3 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_cmp_eq_u32 s46, 0 -; CHECK-NEXT: s_mov_b32 s49, s48 -; CHECK-NEXT: s_mov_b32 s50, s48 -; CHECK-NEXT: s_cselect_b32 s51, 0, s1 -; CHECK-NEXT: s_cselect_b32 s55, 0, s35 -; CHECK-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 -; CHECK-NEXT: s_cselect_b32 s52, 0, s2 -; CHECK-NEXT: s_cselect_b32 s56, 0, s36 -; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43 -; CHECK-NEXT: v_mov_b32_e32 v4, s50 -; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0 -; CHECK-NEXT: s_cselect_b32 s53, 0, s3 -; CHECK-NEXT: s_cselect_b32 s54, 0, s34 -; CHECK-NEXT: s_cselect_b32 s57, 0, s37 -; CHECK-NEXT: s_cselect_b32 s58, 0, s38 -; CHECK-NEXT: s_cselect_b32 s59, 0, s0 -; CHECK-NEXT: s_cselect_b32 s60, 0, s39 -; CHECK-NEXT: s_cselect_b32 s61, 0, s40 -; CHECK-NEXT: s_cselect_b32 s62, 0, s41 -; CHECK-NEXT: s_cselect_b32 s63, 0, s42 -; CHECK-NEXT: s_cselect_b32 vcc_hi, 0, s44 -; CHECK-NEXT: s_mov_b32 s46, s48 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_mov_b32 s1, s0 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v4, s2 +; CHECK-NEXT: s_cmp_eq_u32 s49, 0 +; CHECK-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; CHECK-NEXT: s_cselect_b32 s51, 0, s34 +; CHECK-NEXT: s_cselect_b32 s55, 0, s38 +; CHECK-NEXT: s_cselect_b32 s52, 0, s35 +; CHECK-NEXT: s_cselect_b32 s56, 0, s39 +; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s46 +; CHECK-NEXT: s_cselect_b32 s50, s48, 0xf0 +; CHECK-NEXT: s_cselect_b32 s53, 0, s36 +; CHECK-NEXT: s_cselect_b32 s54, 0, s37 +; CHECK-NEXT: s_cselect_b32 s57, 0, s40 +; CHECK-NEXT: s_cselect_b32 s58, 0, s41 +; CHECK-NEXT: s_cselect_b32 s59, 0, s3 +; CHECK-NEXT: s_cselect_b32 s60, 0, s42 +; CHECK-NEXT: s_cselect_b32 s61, 0, s43 +; CHECK-NEXT: s_cselect_b32 s62, 0, s44 +; CHECK-NEXT: s_cselect_b32 s63, 0, s45 +; CHECK-NEXT: s_cselect_b32 vcc_hi, 0, s47 +; CHECK-NEXT: s_mov_b32 s49, s0 ; CHECK-NEXT: scratch_store_b32 off, v0, s51 ; CHECK-NEXT: scratch_store_b32 off, v0, s52 ; CHECK-NEXT: scratch_store_b32 off, v0, s53 @@ -64,7 +64,7 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-NEXT: scratch_store_b32 off, v0, s55 ; CHECK-NEXT: scratch_store_b64 off, v[0:1], s56 ; CHECK-NEXT: scratch_store_b32 off, v0, s57 -; CHECK-NEXT: scratch_store_b32 off, v0, s47 +; CHECK-NEXT: scratch_store_b32 off, v0, s50 ; CHECK-NEXT: scratch_store_b96 off, v[2:4], s58 ; CHECK-NEXT: scratch_store_b96 off, v[2:4], s59 ; CHECK-NEXT: scratch_store_b32 off, v0, s60 diff --git a/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll b/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll index f18a657b8082d..3fd3494675ab9 100644 --- a/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll +++ b/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll @@ -18,21 +18,21 @@ define amdgpu_kernel void @stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b9 ; GFX942-NEXT: s_mov_b32 s9, 0x45004400 ; GFX942-NEXT: s_mov_b32 s10, 0x42004000 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: .LBB0_2: ; %.lr.ph ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[4:7], v2, s[2:3] +; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3] ; GFX942-NEXT: s_add_u32 s4, s4, 8 ; GFX942-NEXT: s_addc_u32 s5, s5, 0 -; GFX942-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_pk_add_f16 v7, v7, s8 -; GFX942-NEXT: v_pk_add_f16 v6, v6, s9 -; GFX942-NEXT: v_pk_add_f16 v5, v5, s10 -; GFX942-NEXT: v_pk_add_f16 v4, v4, 1.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] +; GFX942-NEXT: v_pk_add_f16 v5, v5, s8 +; GFX942-NEXT: v_pk_add_f16 v4, v4, s9 +; GFX942-NEXT: v_pk_add_f16 v3, v3, s10 +; GFX942-NEXT: v_pk_add_f16 v2, v2, 1.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX942-NEXT: s_add_u32 s0, s0, 16 ; GFX942-NEXT: s_addc_u32 s1, s1, 0 ; GFX942-NEXT: s_add_u32 s2, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir index 98552de05c857..67cdf196a4693 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -32,46 +32,46 @@ body: | ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GCN-NEXT: liveins: $vcc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc ; GCN-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY1]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_1]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_2:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY2]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_2]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_3:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY3]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_3]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_4:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY4]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_4]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_5]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_6]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_7]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_8]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_9]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_10]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_11]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_12]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_14]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_15:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY15]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_15]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_16:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY16]], implicit $mode, implicit $exec - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; GCN-NEXT: liveins: $vcc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_1]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_2]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_3]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_4]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_5]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_6]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_7]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_8]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_9]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_13]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_14]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_15]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_16]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_16]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_17:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY17]], implicit $mode, implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_17]], implicit $exec ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index 88963643218a5..16ad293347d17 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -14,42 +14,42 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; GFX11-LABEL: struct_atomic_buffer_load_i32: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB0_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_atomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB0_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -117,42 +117,42 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; GFX11-LABEL: struct_atomic_buffer_load_i32_off: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB2_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_atomic_buffer_load_i32_off: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB2_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -171,43 +171,43 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; GFX11-LABEL: struct_atomic_buffer_load_i32_soff: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB3_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_atomic_buffer_load_i32_soff: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_mov_b32 s5, 4 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_mov_b32 s6, 4 ; GFX12-NEXT: .LBB3_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], s5 idxen offset:4 th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], s6 idxen offset:4 th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -225,42 +225,42 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; GFX11-LABEL: struct_atomic_buffer_load_i32_dlc: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB4_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_atomic_buffer_load_i32_dlc: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB4_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -334,43 +334,43 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; GFX11-LABEL: struct_atomic_buffer_load_i64: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB6_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_atomic_buffer_load_i64: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB6_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -390,42 +390,42 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; GFX11-LABEL: struct_atomic_buffer_load_v2i16: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB7_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_atomic_buffer_load_v2i16: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB7_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -445,23 +445,23 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX11-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb ; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm @@ -469,23 +469,23 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX11-FAKE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX11-FAKE16: ; %bb.0: ; %bb ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2 ; GFX11-FAKE16-NEXT: s_endpgm @@ -493,21 +493,21 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX11-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb ; GFX11-GISEL-TRUE16-NEXT: s_clause 0x1 -; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm @@ -515,96 +515,96 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; GFX11-GISEL-LABEL: struct_atomic_buffer_load_v4i16: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX11-GISEL-NEXT: .LBB8_1: ; %bb1 ; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-GISEL-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 -; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s6, v0 +; GFX11-GISEL-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb -; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 ; GFX12-SDAG-TRUE16-NEXT: s_endpgm ; ; GFX12-FAKE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX12-FAKE16: ; %bb.0: ; %bb -; GFX12-FAKE16-NEXT: s_clause 0x1 -; GFX12-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX12-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2 ; GFX12-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb -; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 -; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l ; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 ; GFX12-GISEL-TRUE16-NEXT: s_endpgm @@ -625,42 +625,42 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; GFX11-LABEL: struct_atomic_buffer_load_v4i32: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB9_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_atomic_buffer_load_v4i32: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB9_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -680,46 +680,46 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; GFX11-LABEL: struct_atomic_buffer_load_ptr: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB10_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: flat_load_b32 v2, v[2:3] +; GFX11-NEXT: flat_load_b32 v1, v[1:2] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_atomic_buffer_load_ptr: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: flat_load_b32 v2, v[2:3] +; GFX12-NEXT: flat_load_b32 v1, v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index 23db2479f66bb..7c915841caa42 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -14,42 +14,42 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB0_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB0_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -117,42 +117,42 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_off: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB2_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_off: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB2_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -171,43 +171,43 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_soff: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB3_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_soff: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_mov_b32 s5, 4 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_mov_b32 s6, 4 ; GFX12-NEXT: .LBB3_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], s5 idxen offset:4 th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], s6 idxen offset:4 th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -225,42 +225,42 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_dlc: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB4_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_dlc: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB4_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -334,43 +334,43 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; GFX11-LABEL: struct_ptr_atomic_buffer_load_i64: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB6_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i64: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB6_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -390,42 +390,42 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; GFX11-LABEL: struct_ptr_atomic_buffer_load_v2i16: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB7_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_v2i16: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB7_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -445,23 +445,23 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX11-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb ; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm @@ -469,23 +469,23 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX11-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX11-FAKE16: ; %bb.0: ; %bb ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-FAKE16-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2 ; GFX11-FAKE16-NEXT: s_endpgm @@ -493,21 +493,21 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX11-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb ; GFX11-GISEL-TRUE16-NEXT: s_clause 0x1 -; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm @@ -515,96 +515,96 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; GFX11-GISEL-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX11-GISEL-NEXT: .LBB8_1: ; %bb1 ; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-GISEL-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v2 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v3 -; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 -; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s6, v0 +; GFX11-GISEL-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB8_1 ; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb -; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 ; GFX12-SDAG-TRUE16-NEXT: s_endpgm ; ; GFX12-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX12-FAKE16: ; %bb.0: ; %bb -; GFX12-FAKE16-NEXT: s_clause 0x1 -; GFX12-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX12-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX12-FAKE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2 ; GFX12-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb -; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 -; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1 ; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l ; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 ; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 ; GFX12-GISEL-TRUE16-NEXT: s_endpgm @@ -625,42 +625,42 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; GFX11-LABEL: struct_ptr_atomic_buffer_load_v4i32: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB9_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_v4i32: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB9_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm @@ -680,46 +680,46 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; GFX11-LABEL: struct_ptr_atomic_buffer_load_ptr: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB10_1: ; %bb1 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: flat_load_b32 v2, v[2:3] +; GFX11-NEXT: flat_load_b32 v1, v[1:2] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_ptr: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_xcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %bb1 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: flat_load_b32 v2, v[2:3] +; GFX12-NEXT: flat_load_b32 v1, v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %bb2 ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index c1a32aafbc71e..942e037bd550b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -6831,41 +6831,40 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6880,39 +6879,38 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6927,33 +6925,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -6967,29 +6965,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7006,29 +7004,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7046,30 +7044,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7188,41 +7186,40 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7237,39 +7234,38 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7284,33 +7280,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -7324,29 +7320,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7363,29 +7359,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7403,30 +7399,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7545,40 +7541,41 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7591,39 +7588,39 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7636,34 +7633,34 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7674,35 +7671,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7712,35 +7709,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7751,36 +7748,36 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7889,40 +7886,41 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7935,39 +7933,39 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7980,34 +7978,34 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8018,35 +8016,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8056,35 +8054,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8095,36 +8093,36 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8837,21 +8835,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB28_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s8, s3, 3 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: ds_read_b32 v1, v2 +; GFX7-NEXT: s_lshl_b32 s10, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: ds_read_b32 v1, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 @@ -8867,22 +8866,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_read_b32 v3, v1 +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: ds_read_b32 v2, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 ; GFX7-NEXT: .LBB28_7: ; %Flow21 @@ -8917,23 +8917,24 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v3 -; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: ds_read_b32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: .LBB28_13: ; %Flow19 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -8961,21 +8962,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB28_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s8, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: ds_read_b32 v1, v2 +; GFX6-NEXT: s_lshl_b32 s10, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 @@ -8991,22 +8993,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s3, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_read_b32 v3, v1 +; GFX6-NEXT: s_lshl_b32 s3, s3, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: ds_read_b32 v2, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 ; GFX6-NEXT: .LBB28_7: ; %Flow19 @@ -9041,23 +9044,24 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v3 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: ds_read_b32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: .LBB28_13: ; %Flow17 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -9673,21 +9677,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB29_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s8, s3, 3 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: ds_read_b32 v1, v2 +; GFX7-NEXT: s_lshl_b32 s10, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: ds_read_b32 v1, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 @@ -9703,22 +9708,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_read_b32 v3, v1 +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: ds_read_b32 v2, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 ; GFX7-NEXT: .LBB29_7: ; %Flow21 @@ -9753,23 +9759,24 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v3 -; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: ds_read_b32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: .LBB29_13: ; %Flow19 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -9797,21 +9804,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB29_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s8, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: ds_read_b32 v1, v2 +; GFX6-NEXT: s_lshl_b32 s10, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 @@ -9827,22 +9835,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s3, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_read_b32 v3, v1 +; GFX6-NEXT: s_lshl_b32 s3, s3, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: ds_read_b32 v2, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 ; GFX6-NEXT: .LBB29_7: ; %Flow19 @@ -9877,23 +9886,24 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v3 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: ds_read_b32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: .LBB29_13: ; %Flow17 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 739e86d1928b1..64bf599a20d18 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -5552,15 +5552,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5582,17 +5582,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 @@ -5605,15 +5605,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5632,14 +5632,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5658,16 +5658,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 @@ -5681,16 +5681,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 @@ -5705,17 +5705,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5824,15 +5824,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5854,17 +5854,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX942-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 @@ -5877,15 +5877,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5904,14 +5904,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5930,16 +5930,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 @@ -5953,16 +5953,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 @@ -5977,17 +5977,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6096,14 +6096,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6125,13 +6125,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX942-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6147,14 +6147,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6173,13 +6173,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6198,12 +6198,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6220,12 +6220,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6241,23 +6241,23 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6358,14 +6358,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6387,13 +6387,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX942-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6409,14 +6409,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6435,13 +6435,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6460,12 +6460,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6482,12 +6482,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6503,23 +6503,23 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6626,41 +6626,40 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6680,39 +6679,38 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6729,30 +6727,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6768,41 +6766,40 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6817,39 +6814,38 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6864,33 +6860,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -6904,29 +6900,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6943,29 +6939,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6983,30 +6979,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7108,41 +7104,40 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7162,39 +7157,38 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7211,30 +7205,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX942-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7250,41 +7244,40 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7299,39 +7292,38 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7346,33 +7338,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -7386,29 +7378,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7425,29 +7417,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7465,30 +7457,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7590,40 +7582,40 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7641,39 +7633,39 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7687,36 +7679,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7726,40 +7718,41 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7772,39 +7765,39 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7817,34 +7810,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7855,35 +7848,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7893,35 +7886,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7932,36 +7925,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8053,40 +8046,40 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8104,39 +8097,39 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8150,36 +8143,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8189,40 +8182,41 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8235,39 +8229,39 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8280,34 +8274,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8318,35 +8312,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8356,35 +8350,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8395,36 +8389,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 6da80262951e5..d3ee494883ba5 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -5552,15 +5552,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5582,17 +5582,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB20_1 @@ -5605,15 +5605,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5632,14 +5632,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5658,16 +5658,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 @@ -5681,16 +5681,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 @@ -5705,17 +5705,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5824,15 +5824,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5854,17 +5854,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX942-NEXT: v_pk_max_f16 v2, v4, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX942-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 @@ -5877,15 +5877,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5904,14 +5904,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5930,16 +5930,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 @@ -5953,16 +5953,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 @@ -5977,17 +5977,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6096,14 +6096,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6125,13 +6125,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX942-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6147,14 +6147,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6173,13 +6173,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6198,12 +6198,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6220,12 +6220,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6241,23 +6241,23 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v4, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6358,14 +6358,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -6387,13 +6387,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX942-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX942-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6409,14 +6409,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6435,13 +6435,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -6460,12 +6460,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6482,12 +6482,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -6503,23 +6503,23 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v4, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6626,41 +6626,40 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6680,39 +6679,38 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -6729,30 +6727,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6768,41 +6766,40 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6817,39 +6814,38 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6864,33 +6860,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -6904,29 +6900,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6943,29 +6939,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6983,30 +6979,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7108,41 +7104,40 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7162,39 +7157,38 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7211,30 +7205,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX942-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7250,41 +7244,40 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7299,39 +7292,38 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7346,33 +7338,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -7386,29 +7378,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7425,29 +7417,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7465,30 +7457,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7590,40 +7582,40 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7641,39 +7633,39 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7687,36 +7679,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7726,40 +7718,41 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7772,39 +7765,39 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7817,34 +7810,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7855,35 +7848,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7893,35 +7886,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7932,36 +7925,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8053,40 +8046,40 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8104,39 +8097,39 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8150,36 +8143,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8189,40 +8182,41 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8235,39 +8229,39 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8280,34 +8274,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8318,35 +8312,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8356,35 +8350,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8395,36 +8389,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 786989cc9fb57..58715e72532b9 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -7394,41 +7394,40 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7448,39 +7447,38 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7497,30 +7495,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7536,41 +7534,40 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7585,39 +7582,38 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7632,33 +7628,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -7672,29 +7668,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7711,29 +7707,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7751,30 +7747,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7876,41 +7872,40 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1 -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7930,39 +7925,38 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -7979,30 +7973,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX942-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX942-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -8018,41 +8012,40 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8067,39 +8060,38 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8114,33 +8106,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -8154,29 +8146,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -8193,29 +8185,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -8233,30 +8225,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -8358,40 +8350,40 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8409,39 +8401,39 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8455,36 +8447,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 +; GFX942-NEXT: ds_read_b32 v2, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB26_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8494,40 +8486,41 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8540,39 +8533,39 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -8585,34 +8578,34 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -8623,35 +8616,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8661,35 +8654,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8700,36 +8693,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8821,40 +8814,40 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8872,39 +8865,39 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff -; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8918,36 +8911,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX942-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX942-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX942-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX942-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX942-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB27_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8957,40 +8950,41 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9003,39 +8997,39 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl0_inv -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -9048,34 +9042,34 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -9086,35 +9080,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9124,35 +9118,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9163,36 +9157,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index c92c672dda2ad..4eb9cee00be3f 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -169,12 +169,12 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 ; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual +; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_add_i32 s6, s8, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; CHECK-NEXT: s_mov_b64 s[8:9], 1 ; CHECK-NEXT: s_cbranch_vccnz .LBB1_6 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index cf244f0b1f884..802f534c2bf15 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -5,76 +5,72 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s0, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: s_mov_b32 s1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_cmp_lg_u32 s0, 0 -; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: s_cmp_lg_u32 s2, 0 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX942-NEXT: s_or_b32 s4, s3, 1 -; GFX942-NEXT: s_ashr_i32 s5, s3, 31 -; GFX942-NEXT: s_mov_b32 s3, s2 -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: s_or_b32 s4, s1, 1 +; GFX942-NEXT: s_ashr_i32 s5, s1, 31 +; GFX942-NEXT: s_mov_b32 s1, s0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 -; GFX942-NEXT: s_and_b32 s3, s5, s4 +; GFX942-NEXT: s_and_b32 s1, s5, s4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr3 +; GFX942-NEXT: ; implicit-def: $sgpr1 ; GFX942-NEXT: ; implicit-def: $agpr2 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX908-NEXT: s_mov_b32 s2, 0 -; GFX908-NEXT: s_mov_b32 s3, 0 +; GFX908-NEXT: s_mov_b32 s0, 0 +; GFX908-NEXT: s_mov_b32 s1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_cmp_lg_u32 s0, 0 -; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX908-NEXT: s_cmp_lg_u32 s2, 0 +; GFX908-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX908-NEXT: s_or_b32 s4, s3, 1 -; GFX908-NEXT: s_ashr_i32 s5, s3, 31 -; GFX908-NEXT: s_mov_b32 s3, s2 -; GFX908-NEXT: v_mov_b32_e32 v1, s2 +; GFX908-NEXT: s_or_b32 s4, s1, 1 +; GFX908-NEXT: s_ashr_i32 s5, s1, 31 +; GFX908-NEXT: s_mov_b32 s1, s0 +; GFX908-NEXT: v_mov_b32_e32 v2, s1 ; GFX908-NEXT: s_nop 2 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a2 -; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: v_mov_b32_e32 v1, s0 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: s_and_b32 s3, s5, s4 +; GFX908-NEXT: s_and_b32 s1, s5, s4 ; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3] ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: ; implicit-def: $sgpr3 +; GFX908-NEXT: ; implicit-def: $sgpr1 ; GFX908-NEXT: ; implicit-def: $agpr2 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index 0887f41b7db97..702ea92de4f21 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -23,10 +23,8 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: ; Child Loop BB0_4 Depth 2 ; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GCN-NEXT: s_mov_b32 s12, s6 ; GCN-NEXT: s_branch .LBB0_4 ; GCN-NEXT: .LBB0_3: ; %Flow1 @@ -36,7 +34,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: .LBB0_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_lshl_b32 s12, s12, 5 ; GCN-NEXT: s_cbranch_vccz .LBB0_6 ; GCN-NEXT: ; %bb.5: ; in Loop: Header=BB0_4 Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 4addf42b27984..5e76c7d7c734f 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1747,8 +1747,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index f497752994852..dcc2c798d6108 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -55,19 +55,17 @@ define amdgpu_kernel void @undef_v6f32(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v6f32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB4_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[6:9], v0 ; GCN-NEXT: ds_read_b64 v[10:11], v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_add_f32_e32 v3, v9, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -99,19 +97,17 @@ define amdgpu_kernel void @undef_v6i32(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v6i32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB5_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[6:9], v0 ; GCN-NEXT: ds_read_b64 v[10:11], v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_add_u32_e32 v3, v9, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -144,19 +140,17 @@ define amdgpu_kernel void @undef_v5f32(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v5f32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB6_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[5:8], v0 ; GCN-NEXT: ds_read_b32 v9, v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_add_f32_e32 v3, v8, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -187,19 +181,17 @@ define amdgpu_kernel void @undef_v5i32(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v5i32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB7_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[5:8], v0 ; GCN-NEXT: ds_read_b32 v9, v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_add_u32_e32 v3, v8, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -230,21 +222,19 @@ ret: define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v3f64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s3, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_bitcmp1_b32 s1, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GCN-NEXT: v_mov_b32_e32 v6, s0 ; GCN-NEXT: .LBB8_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[7:10], v6 ; GCN-NEXT: ds_read_b64 v[11:12], v6 offset:16 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_add_f64 v[2:3], v[9:10], v[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -252,7 +242,7 @@ define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: v_add_f64 v[0:1], v[7:8], v[0:1] ; GCN-NEXT: s_cbranch_vccnz .LBB8_1 ; GCN-NEXT: ; %bb.2: ; %ret -; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_mov_b32_e32 v6, s0 ; GCN-NEXT: ds_write_b64 v6, v[4:5] offset:16 ; GCN-NEXT: ds_write_b128 v6, v[0:3] ; GCN-NEXT: s_endpgm @@ -273,32 +263,30 @@ ret: define amdgpu_kernel void @undef_v3i64(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v3i64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s5, 0 +; GCN-NEXT: s_bitcmp1_b32 s3, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_xor_b64 s[4:5], s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NEXT: .LBB9_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[7:10], v6 ; GCN-NEXT: ds_read_b64 v[11:12], v6 offset:16 ; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_add_co_u32_e64 v0, s[2:3], v7, v0 +; GCN-NEXT: v_add_co_u32_e64 v0, s[0:1], v7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v11, v4 ; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc ; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v10, v3, vcc -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] -; GCN-NEXT: v_addc_co_u32_e64 v1, s[2:3], v8, v1, s[2:3] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN-NEXT: v_addc_co_u32_e64 v1, s[0:1], v8, v1, s[0:1] ; GCN-NEXT: s_cbranch_vccnz .LBB9_1 ; GCN-NEXT: ; %bb.2: ; %ret -; GCN-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NEXT: ds_write_b64 v6, v[4:5] offset:16 ; GCN-NEXT: ds_write_b128 v6, v[0:3] ; GCN-NEXT: s_endpgm @@ -320,25 +308,23 @@ ret: define amdgpu_kernel void @undef_v4f16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v4f16: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s3, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s1, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: .LBB10_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b64 v[3:4], v2 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_f16 v1, v4, v1 ; GCN-NEXT: v_pk_add_f16 v0, v3, v0 ; GCN-NEXT: s_cbranch_vccnz .LBB10_1 ; GCN-NEXT: ; %bb.2: ; %ret -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: ds_write_b64 v2, v[0:1] ; GCN-NEXT: s_endpgm entry: @@ -358,25 +344,23 @@ ret: define amdgpu_kernel void @undef_v4i16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v4i16: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s3, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s1, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: .LBB11_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b64 v[3:4], v2 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v1, v4, v1 ; GCN-NEXT: v_pk_add_u16 v0, v3, v0 ; GCN-NEXT: s_cbranch_vccnz .LBB11_1 ; GCN-NEXT: ; %bb.2: ; %ret -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: ds_write_b64 v2, v[0:1] ; GCN-NEXT: s_endpgm entry: @@ -397,25 +381,23 @@ ret: define amdgpu_kernel void @undef_v2f16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v2f16: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s3, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_bitcmp1_b32 s1, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: .LBB12_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: ds_read_b32 v2, v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: ds_read_b32 v2, v1 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_add_f16 v1, v2, v1 +; GCN-NEXT: v_pk_add_f16 v0, v2, v0 ; GCN-NEXT: s_cbranch_vccnz .LBB12_1 ; GCN-NEXT: ; %bb.2: ; %ret -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: ds_write_b32 v0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_endpgm entry: br label %loop @@ -434,25 +416,23 @@ ret: define amdgpu_kernel void @undef_v2i16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v2i16: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s3, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_bitcmp1_b32 s1, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: .LBB13_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: ds_read_b32 v2, v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: ds_read_b32 v2, v1 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_add_u16 v1, v2, v1 +; GCN-NEXT: v_pk_add_u16 v0, v2, v0 ; GCN-NEXT: s_cbranch_vccnz .LBB13_1 ; GCN-NEXT: ; %bb.2: ; %ret -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: ds_write_b32 v0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_endpgm entry: br label %loop @@ -496,17 +476,15 @@ define amdgpu_kernel void @undef_bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB15_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_u16 v1, v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v1, v0 ; GCN-NEXT: s_cbranch_vccnz .LBB15_1 @@ -534,17 +512,15 @@ define amdgpu_kernel void @undef_v2bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB16_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b32 v1, v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v0, v1, v0 ; GCN-NEXT: s_cbranch_vccnz .LBB16_1 @@ -572,19 +548,17 @@ define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB17_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_u16 v3, v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_pk_add_u16 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -615,17 +589,15 @@ define amdgpu_kernel void @undef_v4bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB18_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b64 v[2:3], v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v1, v3, v1 ; GCN-NEXT: v_pk_add_u16 v0, v2, v0 @@ -654,19 +626,17 @@ define amdgpu_kernel void @undef_v6bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v6bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB19_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b64 v[3:4], v0 ; GCN-NEXT: ds_read_b32 v5, v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_pk_add_u16 v1, v4, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -698,17 +668,15 @@ define amdgpu_kernel void @undef_v8bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB20_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[4:7], v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v3, v7, v3 ; GCN-NEXT: v_pk_add_u16 v2, v6, v2 @@ -739,18 +707,16 @@ define amdgpu_kernel void @undef_v16bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB21_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[8:11], v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v7, v11, v7 ; GCN-NEXT: v_pk_add_u16 v6, v10, v6 @@ -786,6 +752,7 @@ define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-LABEL: undef_v32bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s0, s[8:9], 0x4 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: ; implicit-def: $vgpr8 ; GCN-NEXT: ; implicit-def: $vgpr12 @@ -793,13 +760,10 @@ define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) { ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB22_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: ds_read_b128 v[16:19], v0 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v15, v19, v15 ; GCN-NEXT: v_pk_add_u16 v14, v18, v14 diff --git a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll index a5299ea36958d..c509660b57995 100644 --- a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll +++ b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll @@ -17,32 +17,30 @@ declare float @llvm.fmuladd.f32(float, float, float) #0 define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture %arg2, float %arg3, i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) local_unnamed_addr !reqd_work_group_size !0 { ; CHECK-LABEL: foo: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s6, s[4:5], 0x10 -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x10 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; CHECK-NEXT: s_load_dword s10, s[4:5], 0x11 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CHECK-NEXT: s_movk_i32 s0, 0x54 +; CHECK-NEXT: s_movk_i32 s3, 0x54 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mad_u32_u24 v1, v1, s0, v2 +; CHECK-NEXT: v_mad_u32_u24 v1, v1, s3, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp1_b32 s6, 8 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s6, 16 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; CHECK-NEXT: s_bitcmp1_b32 s2, 8 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s2, 16 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; CHECK-NEXT: s_bitcmp1_b32 s0, 24 ; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v2 ; CHECK-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; CHECK-NEXT: s_bitcmp1_b32 s2, 24 +; CHECK-NEXT: s_bitcmp1_b32 s1, 0 ; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_xor_b64 s[6:7], s[6:7], -1 -; CHECK-NEXT: s_bitcmp1_b32 s3, 0 -; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s10, 8 ; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 -; CHECK-NEXT: s_and_b64 s[2:3], exec, s[6:7] -; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11] -; CHECK-NEXT: s_and_b64 s[8:9], exec, s[8:9] +; CHECK-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: s_and_b64 s[4:5], exec, s[10:11] +; CHECK-NEXT: s_and_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_mov_b32 m0, -1 ; CHECK-NEXT: .LBB0_1: ; %.loopexit145 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 @@ -54,13 +52,13 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg, ; CHECK-NEXT: .LBB0_2: ; %.loopexit ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=2 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0x540, v2 -; CHECK-NEXT: s_mov_b64 vcc, s[4:5] +; CHECK-NEXT: s_mov_b64 vcc, s[2:3] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_5 ; CHECK-NEXT: .LBB0_3: ; %bb13 ; CHECK-NEXT: ; Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: ; => This Loop Header: Depth=2 ; CHECK-NEXT: ; Child Loop BB0_4 Depth 3 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_4: ; %bb21 @@ -69,17 +67,17 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg, ; CHECK-NEXT: ; => This Inner Loop Header: Depth=3 ; CHECK-NEXT: ds_write_b32 v3, v0 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 32, v3 -; CHECK-NEXT: s_mov_b64 vcc, s[2:3] +; CHECK-NEXT: s_mov_b64 vcc, s[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB0_4 ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_5: ; %bb31 ; CHECK-NEXT: ; Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: s_mov_b64 vcc, s[6:7] +; CHECK-NEXT: s_mov_b64 vcc, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.6: ; %bb30 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[8:9] +; CHECK-NEXT: s_mov_b64 vcc, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_1 ; CHECK-NEXT: ; %bb.7: ; %bb11 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index e64e3def98c26..c7b690fbd4a21 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1868,8 +1868,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll index 42436a1b4c279..f62ddfcabef98 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll @@ -85,62 +85,65 @@ merge: define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 { ; GFX900-LABEL: test_loop_with_if: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: s_mov_b64 s[4:5], 0 -; GFX900-NEXT: s_movk_i32 s10, 0xfe +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: s_movk_i32 s12, 0xfe ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_bitcmp1_b32 s2, 0 -; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX900-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] -; GFX900-NEXT: v_mov_b32_e32 v2, s1 -; GFX900-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GFX900-NEXT: v_mov_b32_e32 v1, s0 -; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3 +; GFX900-NEXT: s_bitcmp1_b32 s6, 0 +; GFX900-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX900-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GFX900-NEXT: s_branch .LBB2_2 ; GFX900-NEXT: .LBB2_1: ; %latch ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX900-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_add_u32_e32 v5, 20, v3 -; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s10, v5 -; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX900-NEXT: flat_store_dword v[1:2], v3 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: v_add_u32_e32 v3, 20, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, s1 +; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s12, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, s0 +; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: flat_store_dword v[4:5], v1 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execz .LBB2_8 ; GFX900-NEXT: .LBB2_2: ; %loop ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: flat_load_dwordx2 v[3:4], v[1:2] -; GFX900-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX900-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_mov_b32_e32 v1, s0 +; GFX900-NEXT: flat_load_dwordx2 v[1:2], v[1:2] +; GFX900-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX900-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX900-NEXT: s_mov_b64 s[8:9], 0 ; GFX900-NEXT: s_cbranch_vccnz .LBB2_4 ; GFX900-NEXT: ; %bb.3: ; %if ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: v_cmp_gt_i32_e32 vcc, 11, v5 -; GFX900-NEXT: s_andn2_b64 s[8:9], s[2:3], exec -; GFX900-NEXT: s_and_b64 s[12:13], vcc, exec -; GFX900-NEXT: s_mov_b64 s[6:7], -1 -; GFX900-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX900-NEXT: v_cmp_gt_i32_e32 vcc, 11, v3 +; GFX900-NEXT: s_andn2_b64 s[10:11], s[6:7], exec +; GFX900-NEXT: s_and_b64 s[14:15], vcc, exec +; GFX900-NEXT: s_mov_b64 s[8:9], -1 +; GFX900-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] ; GFX900-NEXT: .LBB2_4: ; %Flow ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] -; GFX900-NEXT: s_xor_b64 s[8:9], exec, s[12:13] +; GFX900-NEXT: s_and_saveexec_b64 s[14:15], s[10:11] +; GFX900-NEXT: s_xor_b64 s[10:11], exec, s[14:15] ; GFX900-NEXT: s_cbranch_execz .LBB2_6 ; GFX900-NEXT: ; %bb.5: ; %else ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX900-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; GFX900-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX900-NEXT: s_andn2_b64 s[8:9], s[8:9], exec ; GFX900-NEXT: .LBB2_6: ; %Flow1 ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GFX900-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX900-NEXT: s_and_saveexec_b64 s[10:11], s[8:9] ; GFX900-NEXT: s_cbranch_execz .LBB2_1 ; GFX900-NEXT: ; %bb.7: ; %then ; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: flat_store_dword v[1:2], v0 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, s1 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: flat_store_dword v[2:3], v0 ; GFX900-NEXT: s_branch .LBB2_1 ; GFX900-NEXT: .LBB2_8: ; %end ; GFX900-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index d80ec6bd34945..ce3bb56e67ced 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -32,87 +32,67 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[68:71], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[68:69] +; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[48:49], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 -; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 +; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s70, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s54, 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[52:53], -1, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[50:51], s[4:5], v[0:1] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane +; GLOBALNESS1-NEXT: s_xor_b64 s[64:65], s[52:53], -1 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 0 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: s_xor_b64 s[66:67], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3 +; GLOBALNESS1-NEXT: s_xor_b64 s[98:99], s[4:5], -1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80 -; GLOBALNESS1-NEXT: s_mov_b32 s82, s16 -; GLOBALNESS1-NEXT: s_mov_b32 s83, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s84, s14 +; GLOBALNESS1-NEXT: s_mov_b32 s84, s16 +; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS1-NEXT: s_mov_b32 s85, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s86, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 ; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 2 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 3 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 4 ; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 5 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 6 ; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 7 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s70, 8 -; GLOBALNESS1-NEXT: v_writelane_b32 v57, s71, 9 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[54:55], 1, v2 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s52, 8 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s53, 9 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6 ; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -138,13 +118,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[70:71] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[52:53] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 @@ -152,12 +132,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], -1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s55, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s71, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s71, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_7: ; %Flow26 @@ -166,7 +146,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s71, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 @@ -179,10 +159,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[68:69], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[96:97], s[68:69] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -191,7 +171,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 11 ; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 2 ; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 3 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: s_mov_b32 s87, s71 +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -201,73 +182,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[96:97], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[98:99], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[70:71], 0, v[0:1] +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[80:81], 0, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 ; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[68:69] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[98:99] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[64:65] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 0 ; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[98:99] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[70:71] ; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[80:81] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s70, s38, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s71, s39, 0 +; GLOBALNESS1-NEXT: s_add_u32 s82, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s83, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[70:71] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[82:83] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[52:53] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[70:71] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[82:83] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[58:59], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[52:53] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[80:81] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -275,24 +254,22 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_branch .LBB1_14 ; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 -; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 +; GLOBALNESS1-NEXT: v_readlane_b32 s52, v57, 8 ; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9 -; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 +; GLOBALNESS1-NEXT: s_mov_b32 s71, s87 +; GLOBALNESS1-NEXT: v_readlane_b32 s53, v57, 9 ; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[96:97] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[68:69] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4 ; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -315,9 +292,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 @@ -333,9 +310,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s84 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s86 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s85 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s84 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: .LBB1_33: ; %UnifiedUnreachableBlock @@ -343,87 +320,67 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[52:55], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx4 s[68:71], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[52:53] +; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[68:69] +; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 -; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 +; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 -; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s70, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s54, 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[52:53], -1, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[50:51], s[4:5], v[0:1] ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane +; GLOBALNESS0-NEXT: s_xor_b64 s[64:65], s[52:53], -1 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 0 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[66:67], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3 +; GLOBALNESS0-NEXT: s_xor_b64 s[98:99], s[4:5], -1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80 -; GLOBALNESS0-NEXT: s_mov_b32 s70, s16 -; GLOBALNESS0-NEXT: s_mov_b32 s71, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s82, s14 +; GLOBALNESS0-NEXT: s_mov_b32 s82, s16 +; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS0-NEXT: s_mov_b32 s83, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s84, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v2 ; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 2 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 3 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 4 ; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 5 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 6 ; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 7 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s84, 8 -; GLOBALNESS0-NEXT: v_writelane_b32 v57, s85, 9 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[54:55], 1, v2 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s52, 8 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s53, 9 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6 ; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -449,13 +406,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[84:85] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 @@ -463,12 +420,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], -1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s55, 1 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s71, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s71, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_7: ; %Flow26 @@ -477,7 +434,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.8: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 0 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s71, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 @@ -490,10 +447,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[86:87], 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[68:69], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[96:97], s[68:69] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -502,8 +459,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 11 ; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 2 ; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 3 -; GLOBALNESS0-NEXT: s_mov_b32 s83, s55 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -513,73 +469,71 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[96:97], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[98:99], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[70:71], 0, v[0:1] +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[80:81], 0, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 ; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[68:69] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[98:99] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[64:65] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 0 ; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[98:99] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[70:71] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[80:81] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s84, s38, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s85, s39, 0 +; GLOBALNESS0-NEXT: s_add_u32 s86, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s87, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[84:85] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[86:87] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[52:53] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[84:85] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[86:87] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[58:59], off -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[52:53] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[80:81] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -587,22 +541,22 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_branch .LBB1_14 ; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s52, v57, 8 ; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s53, v57, 9 ; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[96:97] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[68:69] +; GLOBALNESS0-NEXT: s_load_dwordx4 s[68:71], s[38:39], 0x0 ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4 ; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -625,9 +579,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 @@ -643,9 +597,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[48:49] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s82 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s84 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s83 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: .LBB1_33: ; %UnifiedUnreachableBlock diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index bc9a3f2389e7e..bf1f6980fe25a 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1186,8 +1186,8 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 31708a9b738db..00369bafd99b4 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -55,13 +55,10 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 { ; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; CHECK-NEXT: .LBB1_1: ; %bb9 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb11 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 464dad83f47c9..2a2ad573e708d 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1293,8 +1293,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index 0f368ffd33b9d..915729c761656 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -250,15 +250,15 @@ define amdgpu_kernel void @simple_test_v_loop(ptr addrspace(1) %dst, ptr addrspa ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s10 -; SI-NEXT: s_mov_b32 s5, s11 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: .LBB4_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -314,8 +314,6 @@ define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture % ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, s13 @@ -330,6 +328,8 @@ define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture % ; SI-NEXT: s_mov_b64 s[6:7], 0 ; SI-NEXT: .LBB5_2: ; %bb10 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, v[6:7], s[8:11], 0 addr64 ; SI-NEXT: buffer_load_dword v9, v[4:5], s[8:11], 0 addr64 @@ -355,8 +355,8 @@ define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture % ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_and_b64 s[12:13], vcc, exec -; SI-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] +; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; SI-NEXT: .LBB5_4: ; %Flow ; SI-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index d15db230524e0..221d7a9e5a702 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -6,15 +6,15 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: .LBB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 +; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll index 1a1fe0e2b19e2..96c9e7523f642 100644 --- a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll +++ b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll @@ -20,101 +20,101 @@ define void @jr_without_ra(ptr %rtwdev, ptr %chan, ptr %h2c, i8 %.pre, i1 %cmp.i ; CHECK-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill ; CHECK-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill ; CHECK-NEXT: st.d $s8, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: move $s7, $zero -; CHECK-NEXT: move $s0, $zero +; CHECK-NEXT: move $s6, $zero +; CHECK-NEXT: move $s1, $zero ; CHECK-NEXT: ld.d $t0, $sp, 184 -; CHECK-NEXT: ld.d $s2, $sp, 176 -; CHECK-NEXT: ld.d $s1, $sp, 168 -; CHECK-NEXT: ld.d $t1, $sp, 160 -; CHECK-NEXT: ld.d $t2, $sp, 152 -; CHECK-NEXT: ld.d $t3, $sp, 144 -; CHECK-NEXT: ld.d $t4, $sp, 136 -; CHECK-NEXT: ld.d $t5, $sp, 128 -; CHECK-NEXT: ld.d $t6, $sp, 120 -; CHECK-NEXT: ld.d $t7, $sp, 112 -; CHECK-NEXT: ld.d $t8, $sp, 104 -; CHECK-NEXT: ld.d $fp, $sp, 96 +; CHECK-NEXT: ld.d $t1, $sp, 176 +; CHECK-NEXT: ld.d $s2, $sp, 168 +; CHECK-NEXT: ld.d $t2, $sp, 160 +; CHECK-NEXT: ld.d $t3, $sp, 152 +; CHECK-NEXT: ld.d $t4, $sp, 144 +; CHECK-NEXT: ld.d $t5, $sp, 136 +; CHECK-NEXT: ld.d $t6, $sp, 128 +; CHECK-NEXT: ld.d $t7, $sp, 120 +; CHECK-NEXT: ld.d $t8, $sp, 112 +; CHECK-NEXT: ld.d $fp, $sp, 104 +; CHECK-NEXT: ld.d $s0, $sp, 96 ; CHECK-NEXT: andi $a4, $a4, 1 -; CHECK-NEXT: alsl.d $a6, $a6, $s1, 4 -; CHECK-NEXT: pcalau12i $s1, %pc_hi20(.LJTI0_0) -; CHECK-NEXT: addi.d $s1, $s1, %pc_lo12(.LJTI0_0) -; CHECK-NEXT: slli.d $s3, $s2, 2 -; CHECK-NEXT: alsl.d $s2, $s2, $s3, 1 -; CHECK-NEXT: add.d $s2, $t5, $s2 -; CHECK-NEXT: addi.w $s4, $zero, -41 +; CHECK-NEXT: alsl.d $a6, $a6, $s2, 4 +; CHECK-NEXT: pcalau12i $s2, %pc_hi20(.LJTI0_0) +; CHECK-NEXT: addi.d $s2, $s2, %pc_lo12(.LJTI0_0) ; CHECK-NEXT: ori $s3, $zero, 1 -; CHECK-NEXT: slli.d $s4, $s4, 3 -; CHECK-NEXT: ori $s6, $zero, 3 -; CHECK-NEXT: lu32i.d $s6, 262144 +; CHECK-NEXT: ori $s4, $zero, 50 +; CHECK-NEXT: ori $s5, $zero, 3 +; CHECK-NEXT: lu32i.d $s5, 262144 ; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB0_1: # %sw.bb27.i.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ori $s8, $zero, 1 +; CHECK-NEXT: ori $s7, $zero, 1 ; CHECK-NEXT: .LBB0_2: # %if.else.i106 ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: alsl.d $s5, $s0, $s0, 3 -; CHECK-NEXT: alsl.d $s0, $s5, $s0, 1 -; CHECK-NEXT: add.d $s0, $t0, $s0 -; CHECK-NEXT: ldx.bu $s8, $s0, $s8 +; CHECK-NEXT: alsl.d $s8, $s1, $s1, 3 +; CHECK-NEXT: alsl.d $s1, $s8, $s1, 1 +; CHECK-NEXT: add.d $s1, $t0, $s1 +; CHECK-NEXT: ldx.bu $s7, $s1, $s7 ; CHECK-NEXT: .LBB0_3: # %phy_tssi_get_ofdm_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: st.b $zero, $t5, 0 -; CHECK-NEXT: st.b $s7, $t3, 0 -; CHECK-NEXT: st.b $zero, $t8, 0 -; CHECK-NEXT: st.b $zero, $t1, 0 -; CHECK-NEXT: st.b $zero, $a1, 0 +; CHECK-NEXT: st.b $zero, $t6, 0 +; CHECK-NEXT: st.b $s6, $t4, 0 +; CHECK-NEXT: st.b $zero, $fp, 0 ; CHECK-NEXT: st.b $zero, $t2, 0 -; CHECK-NEXT: st.b $s8, $a5, 0 -; CHECK-NEXT: ori $s0, $zero, 1 -; CHECK-NEXT: move $s7, $a3 +; CHECK-NEXT: st.b $zero, $a1, 0 +; CHECK-NEXT: st.b $zero, $t3, 0 +; CHECK-NEXT: st.b $s7, $a5, 0 +; CHECK-NEXT: ori $s1, $zero, 1 +; CHECK-NEXT: move $s6, $a3 ; CHECK-NEXT: .LBB0_4: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: beqz $a4, .LBB0_9 ; CHECK-NEXT: # %bb.5: # %calc_6g.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: move $s6, $zero ; CHECK-NEXT: bnez $zero, .LBB0_8 ; CHECK-NEXT: # %bb.6: # %calc_6g.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: slli.d $s8, $zero, 3 -; CHECK-NEXT: ldx.d $s8, $s1, $s8 -; CHECK-NEXT: jr $s8 +; CHECK-NEXT: slli.d $s7, $zero, 3 +; CHECK-NEXT: ldx.d $s7, $s2, $s7 +; CHECK-NEXT: jr $s7 ; CHECK-NEXT: .LBB0_7: # %sw.bb12.i.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ori $s7, $zero, 1 +; CHECK-NEXT: ori $s6, $zero, 1 ; CHECK-NEXT: .LBB0_8: # %if.else58.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ldx.bu $s7, $a6, $s7 +; CHECK-NEXT: ldx.bu $s6, $a6, $s6 ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB0_9: # %if.end.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: andi $s7, $s7, 255 -; CHECK-NEXT: ori $s5, $zero, 50 -; CHECK-NEXT: bltu $s5, $s7, .LBB0_15 +; CHECK-NEXT: andi $s6, $s6, 255 +; CHECK-NEXT: bltu $s4, $s6, .LBB0_15 ; CHECK-NEXT: # %bb.10: # %if.end.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: sll.d $s7, $s3, $s7 -; CHECK-NEXT: and $s8, $s7, $s6 -; CHECK-NEXT: move $s7, $fp -; CHECK-NEXT: beqz $s8, .LBB0_15 +; CHECK-NEXT: sll.d $s6, $s3, $s6 +; CHECK-NEXT: and $s7, $s6, $s5 +; CHECK-NEXT: move $s6, $s0 +; CHECK-NEXT: beqz $s7, .LBB0_15 ; CHECK-NEXT: .LBB0_11: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: move $s8, $zero -; CHECK-NEXT: st.b $zero, $t7, 0 -; CHECK-NEXT: ldx.b $ra, $s2, $t4 +; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: st.b $zero, $t8, 0 +; CHECK-NEXT: slli.d $s8, $t1, 2 +; CHECK-NEXT: alsl.d $s8, $t1, $s8, 1 +; CHECK-NEXT: add.d $s8, $t6, $s8 +; CHECK-NEXT: ldx.b $s8, $s8, $t5 ; CHECK-NEXT: st.b $zero, $a2, 0 ; CHECK-NEXT: st.b $zero, $a7, 0 -; CHECK-NEXT: st.b $zero, $t6, 0 -; CHECK-NEXT: st.b $ra, $a0, 0 +; CHECK-NEXT: st.b $zero, $t7, 0 +; CHECK-NEXT: st.b $s8, $a0, 0 ; CHECK-NEXT: bnez $s3, .LBB0_13 ; CHECK-NEXT: # %bb.12: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: addi.w $s8, $zero, -41 +; CHECK-NEXT: slli.d $s8, $s8, 3 ; CHECK-NEXT: pcalau12i $ra, %pc_hi20(.LJTI0_1) ; CHECK-NEXT: addi.d $ra, $ra, %pc_lo12(.LJTI0_1) -; CHECK-NEXT: ldx.d $s5, $ra, $s4 -; CHECK-NEXT: jr $s5 +; CHECK-NEXT: ldx.d $s8, $ra, $s8 +; CHECK-NEXT: jr $s8 ; CHECK-NEXT: .LBB0_13: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: bnez $s3, .LBB0_1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index dddcd4f107e3b..3b35434b3c55d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -491,8 +491,9 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: j .LBB0_11 ; RV64-NEXT: .LBB0_8: # %vector.ph ; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1 -; RV64-NEXT: slli t6, t0, 28 -; RV64-NEXT: sub t6, t6, t1 +; RV64-NEXT: slli t6, t0, 1 +; RV64-NEXT: slli s0, t0, 28 +; RV64-NEXT: sub t6, s0, t6 ; RV64-NEXT: and t6, t6, a6 ; RV64-NEXT: mv s0, a2 ; RV64-NEXT: mv s1, a4 diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll index d076cb00ad7e0..6d082802f9cd7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -353,8 +353,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov lr, r0 @@ -364,50 +364,48 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: csel r7, r2, r0, lt +; CHECK-NEXT: csel r3, r2, r0, lt ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: it ls ; CHECK-NEXT: movls r1, #3 ; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: subs r1, r1, r7 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: movw r2, #43691 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: movt r2, #43690 -; CHECK-NEXT: ldr r6, [sp, #128] -; CHECK-NEXT: movw r8, :lower16:c +; CHECK-NEXT: ldr r6, [sp, #112] +; CHECK-NEXT: movw r9, :lower16:c ; CHECK-NEXT: umull r1, r2, r1, r2 -; CHECK-NEXT: movt r8, :upper16:c +; CHECK-NEXT: adr.w r8, .LCPI1_1 ; CHECK-NEXT: movs r1, #4 -; CHECK-NEXT: @ implicit-def: $r10 ; CHECK-NEXT: @ implicit-def: $r5 ; CHECK-NEXT: @ implicit-def: $r11 -; CHECK-NEXT: mov.w r9, #12 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: @ implicit-def: $r7 +; CHECK-NEXT: movt r9, :upper16:c +; CHECK-NEXT: mov.w r10, #12 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: add.w r1, r1, r2, lsr #1 ; CHECK-NEXT: add.w r0, r0, r2, lsr #1 -; CHECK-NEXT: bic r3, r1, #3 +; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI1_1 -; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vdup.32 q5, r0 ; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vadd.i32 q4, q0, r7 -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: strd r3, r7, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r2, r4, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: vadd.i32 q4, q0, r3 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: cmn.w r11, #4 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: cmn.w r7, #4 ; CHECK-NEXT: it le ; CHECK-NEXT: mvnle r0, #3 ; CHECK-NEXT: movw r2, #18725 ; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: movt r2, #9362 -; CHECK-NEXT: sub.w r1, r0, r11 -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: subs r1, r0, r7 ; CHECK-NEXT: umull r2, r3, r1, r2 ; CHECK-NEXT: subs r2, r1, r3 ; CHECK-NEXT: add.w r2, r3, r2, lsr #1 @@ -415,19 +413,18 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: lsls r3, r3, #3 ; CHECK-NEXT: sub.w r2, r3, r2, lsr #2 ; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #7 +; CHECK-NEXT: adds r7, r0, #7 ; CHECK-NEXT: .LBB1_4: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: adds r5, #2 -; CHECK-NEXT: subs.w r1, r5, lr -; CHECK-NEXT: asr.w r0, r5, #31 +; CHECK-NEXT: add.w r11, r11, #2 +; CHECK-NEXT: subs.w r1, r11, lr +; CHECK-NEXT: asr.w r0, r11, #31 ; CHECK-NEXT: sbcs.w r0, r0, r12 ; CHECK-NEXT: bge.w .LBB1_28 ; CHECK-NEXT: .LBB1_6: @ %for.cond2.preheader @@ -436,36 +433,35 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Child Loop BB1_10 Depth 2 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_14 Depth 3 -; CHECK-NEXT: cmp.w r11, #2 +; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: bgt .LBB1_5 ; CHECK-NEXT: @ %bb.7: @ %for.body6.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: cmp r7, #5 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #5 ; CHECK-NEXT: bhi .LBB1_17 ; CHECK-NEXT: @ %bb.8: @ %for.body6.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: ldrd r2, r3, [sp, #120] +; CHECK-NEXT: ldrd r2, r3, [sp, #104] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r4, r6 -; CHECK-NEXT: mov r7, r12 -; CHECK-NEXT: mov r6, lr +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r4, lr ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: mov lr, r6 -; CHECK-NEXT: mov r6, r4 -; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov lr, r4 +; CHECK-NEXT: mov r12, r6 +; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: ldr r6, [sp, #112] +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: b .LBB1_10 ; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 -; CHECK-NEXT: add.w r11, r0, #7 +; CHECK-NEXT: adds r7, r0, #7 ; CHECK-NEXT: cmn.w r0, #4 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bge .LBB1_5 ; CHECK-NEXT: .LBB1_10: @ %for.body6.us ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 @@ -488,13 +484,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vqadd.u32 q2, q5, r1 -; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vcmp.u32 hi, q7, q2 -; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vldrw.u32 q2, [r8] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q2, q2, r8 -; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vqadd.u32 q2, q2, r1 +; CHECK-NEXT: add.w r1, r1, #4 +; CHECK-NEXT: vcmp.u32 hi, q6, q2 +; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vadd.i32 q2, q2, r9 +; CHECK-NEXT: vadd.i32 q1, q1, r10 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_12 @@ -507,13 +504,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vqadd.u32 q2, q5, r1 -; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vcmp.u32 hi, q6, q2 -; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vldrw.u32 q2, [r8] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q2, q2, r8 -; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vqadd.u32 q2, q2, r1 +; CHECK-NEXT: add.w r1, r1, #4 +; CHECK-NEXT: vcmp.u32 hi, q5, q2 +; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vadd.i32 q2, q2, r9 +; CHECK-NEXT: vadd.i32 q1, q1, r10 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_14 @@ -523,7 +521,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: beq .LBB1_9 ; CHECK-NEXT: @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 -; CHECK-NEXT: eor r1, r10, #1 +; CHECK-NEXT: eor r1, r5, #1 ; CHECK-NEXT: lsls r1, r1, #31 ; CHECK-NEXT: bne .LBB1_9 ; CHECK-NEXT: b .LBB1_26 @@ -532,11 +530,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.18: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: .LBB1_19: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lsls.w r1, r10, #31 +; CHECK-NEXT: lsls r1, r5, #31 ; CHECK-NEXT: bne .LBB1_27 ; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63 ; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2 @@ -552,19 +550,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: bgt .LBB1_25 ; CHECK-NEXT: @ %bb.23: @ %for.cond.cleanup17.us63.3 ; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2 -; CHECK-NEXT: add.w r11, r0, #28 +; CHECK-NEXT: add.w r7, r0, #28 ; CHECK-NEXT: cmn.w r0, #25 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: blt .LBB1_19 ; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #14 +; CHECK-NEXT: add.w r7, r0, #14 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #21 +; CHECK-NEXT: add.w r7, r0, #21 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_26: @ %for.inc19.us ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -574,7 +572,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: b .LBB1_27 ; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index e8b49c1067379..e858540832607 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -594,71 +594,71 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #28 -; CHECK-NEXT: sub sp, #28 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: blt .LBB13_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: add r2, sp, #12 -; CHECK-NEXT: mov.w r9, #8 +; CHECK-NEXT: mov.w r10, #8 ; CHECK-NEXT: bic r1, r1, #7 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: sub.w r3, r1, #8 -; CHECK-NEXT: add.w r8, r6, r3, lsr #3 -; CHECK-NEXT: adr r3, .LCPI13_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: sub.w r7, r1, #8 +; CHECK-NEXT: add.w r0, r6, r7, lsr #3 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: .LBB13_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 -; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: adr r1, .LCPI13_0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: .LBB13_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q1, [r2] -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: vldrh.s32 q2, [r2, #8] -; CHECK-NEXT: vadd.i16 q1, q1, r9 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r7, r5, d5 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: vldrh.s32 q2, [r2] -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r1, r10, d5 -; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vadd.i16 q0, q0, r10 +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vldrh.s32 q1, [r0] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vmov r12, r11, d3 +; CHECK-NEXT: ldrh.w r8, [r6] +; CHECK-NEXT: vmov r2, r6, d2 ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh.w r2, [r10] -; CHECK-NEXT: ldrh.w r10, [r3] -; CHECK-NEXT: vmov r3, r11, d4 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh.w r11, [r11] -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov.16 q2[1], r11 -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: vmov.16 q2[4], r10 -; CHECK-NEXT: vmov.16 q2[5], r4 -; CHECK-NEXT: vmov.16 q2[6], r7 -; CHECK-NEXT: vmov.16 q2[7], r5 -; CHECK-NEXT: vstrb.8 q2, [r6], #16 +; CHECK-NEXT: ldrh.w r1, [r11] +; CHECK-NEXT: ldrh.w r11, [r5] +; CHECK-NEXT: ldrh.w r5, [r12] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q1[1], r6 +; CHECK-NEXT: vmov.16 q1[2], r5 +; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: vmov.16 q1[4], r11 +; CHECK-NEXT: vmov.16 q1[5], r4 +; CHECK-NEXT: vmov.16 q1[6], r3 +; CHECK-NEXT: vmov.16 q1[7], r8 +; CHECK-NEXT: vstrb.8 q1, [r7], #16 ; CHECK-NEXT: le lr, .LBB13_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r3, r1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: bne .LBB13_2 ; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #28 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: @@ -711,145 +711,144 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #136 -; CHECK-NEXT: sub sp, #136 +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: blt.w .LBB14_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI14_2 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: movs r2, #1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add r4, sp, #72 +; CHECK-NEXT: add r7, sp, #40 +; CHECK-NEXT: add r5, sp, #56 ; CHECK-NEXT: bic r1, r1, #7 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q2, #0x18 -; CHECK-NEXT: add.w r1, r2, r1, lsr #3 -; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill -; CHECK-NEXT: adr r1, .LCPI14_0 -; CHECK-NEXT: adr r2, .LCPI14_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: add r2, sp, #120 -; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill +; CHECK-NEXT: vmov.i16 q6, #0x18 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: add.w r1, r3, r1, lsr #3 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: add.w r10, sp, #104 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload +; CHECK-NEXT: adr r1, .LCPI14_2 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: adr r1, .LCPI14_1 +; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: .LBB14_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q5, [r2] -; CHECK-NEXT: mov r8, r2 -; CHECK-NEXT: vldrh.s32 q0, [r2, #8] +; CHECK-NEXT: vstrw.32 q1, [r4] +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: vldrh.s32 q0, [r4, #8] +; CHECK-NEXT: mov r11, r4 +; CHECK-NEXT: mov r5, r7 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vldrh.s32 q0, [r2] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: vmov r6, r2, d4 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh.w r12, [r4] -; CHECK-NEXT: add r4, sp, #88 -; CHECK-NEXT: ldrh.w r11, [r5] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r5, [r6] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vstrw.32 q6, [r4] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r6, r10, d1 ; CHECK-NEXT: vldrh.s32 q0, [r4] -; CHECK-NEXT: vmov.16 q7[0], r5 -; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q6, q0, r0 +; CHECK-NEXT: vmov r7, r4, d12 +; CHECK-NEXT: ldrh.w r9, [r2] +; CHECK-NEXT: ldrh.w r2, [r10] +; CHECK-NEXT: str r2, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: ldrh.w r8, [r3] +; CHECK-NEXT: ldrh r3, [r6] +; CHECK-NEXT: ldrh r2, [r7] +; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vstrw.32 q3, [r7] +; CHECK-NEXT: vldrh.s32 q0, [r7] +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.16 q4[1], r4 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, r9, d0 -; CHECK-NEXT: vmov r2, r5, d1 -; CHECK-NEXT: vldrh.s32 q0, [r4, #8] +; CHECK-NEXT: vmov r4, r6, d0 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vldrh.s32 q0, [r7, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q1[0], r6 -; CHECK-NEXT: ldrh.w r6, [r9] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q1[1], r6 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: vmov.16 q1[3], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: ldrh r4, [r6] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, r5, d1 -; CHECK-NEXT: vmov.16 q1[5], r6 -; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov.16 q5[2], r1 +; CHECK-NEXT: vmov r1, r4, d0 +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q5[4], r1 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vstrw.32 q4, [r10] -; CHECK-NEXT: vldrh.s32 q0, [r6] -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: vstrw.32 q2, [r5] +; CHECK-NEXT: vldrh.s32 q0, [r5] +; CHECK-NEXT: vmov.16 q5[6], r1 +; CHECK-NEXT: vmov.16 q5[7], r2 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r5, d0 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.16 q3[1], r5 -; CHECK-NEXT: vmov r2, r5, d5 -; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload -; CHECK-NEXT: vadd.i16 q6, q6, q2 -; CHECK-NEXT: vadd.i16 q5, q5, q2 -; CHECK-NEXT: vadd.i16 q4, q4, q2 -; CHECK-NEXT: ldrh.w r9, [r2] +; CHECK-NEXT: vmov.16 q7[0], r1 +; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vmov r1, r2, d13 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q3, q3, q6 +; CHECK-NEXT: vadd.i16 q1, q1, q6 +; CHECK-NEXT: vadd.i16 q2, q2, q6 +; CHECK-NEXT: ldrh.w r10, [r2] ; CHECK-NEXT: vmov r2, r4, d1 -; CHECK-NEXT: vldrh.s32 q0, [r6, #8] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q7[2], r9 +; CHECK-NEXT: vldrh.s32 q0, [r5, #8] +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vmov.16 q7[3], r5 +; CHECK-NEXT: vmov.16 q4[2], r1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.16 q7[4], r1 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov.16 q7[6], r12 -; CHECK-NEXT: vmov.16 q7[7], r11 +; CHECK-NEXT: vmov.16 q4[3], r10 +; CHECK-NEXT: vmov.16 q4[4], r9 +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: vmov.16 q4[5], r8 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vmov.16 q4[7], r1 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.16 q3[3], r4 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov.16 q7[3], r4 ; CHECK-NEXT: vmov r2, r4, d0 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vmov.16 q7[5], r4 ; CHECK-NEXT: vmov r2, r4, d1 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: vmov.16 q3[7], r4 -; CHECK-NEXT: vadd.i16 q0, q3, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q7 -; CHECK-NEXT: vstrb.8 q0, [r7], #16 +; CHECK-NEXT: vmov.16 q7[6], r2 +; CHECK-NEXT: vmov.16 q7[7], r4 +; CHECK-NEXT: mov r4, r11 +; CHECK-NEXT: vadd.i16 q0, q7, q5 +; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vstrb.8 q0, [r12], #16 ; CHECK-NEXT: le lr, .LBB14_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r2 ; CHECK-NEXT: bne.w .LBB14_2 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #136 +; CHECK-NEXT: add sp, #88 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -925,260 +924,246 @@ for.cond.cleanup: ; preds = %for.body, %middle.b define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) { ; CHECK-LABEL: gather_inc_v16i8_complex: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB15_1: @ %vector.ph.preheader ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #312 -; CHECK-NEXT: sub sp, #312 -; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill -; CHECK-NEXT: blt.w .LBB15_5 -; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r1, .LCPI15_0 -; CHECK-NEXT: adr r6, .LCPI15_8 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI15_1 -; CHECK-NEXT: adr r7, .LCPI15_7 -; CHECK-NEXT: adr r3, .LCPI15_6 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI15_5 -; CHECK-NEXT: bic r10, r2, #7 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI15_9 -; CHECK-NEXT: vmov.i32 q2, #0x30 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .pad #160 +; CHECK-NEXT: sub sp, #160 +; CHECK-NEXT: bic lr, r2, #7 +; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: vmov.i32 q0, #0x30 ; CHECK-NEXT: .LBB15_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB15_3 Depth 2 +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: adr r1, .LCPI15_1 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI15_8 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: adr r1, .LCPI15_7 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: adr r1, .LCPI15_9 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI15_5 +; CHECK-NEXT: mov r9, lr +; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI15_6 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: adr r1, .LCPI15_3 -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q6, [r1] ; CHECK-NEXT: adr r1, .LCPI15_4 -; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vldrw.u32 q7, [r1] ; CHECK-NEXT: adr r1, .LCPI15_2 -; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r1] ; CHECK-NEXT: adr r1, .LCPI15_10 -; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: adr r1, .LCPI15_11 -; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r1] -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: mov r11, r10 -; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: adr r1, .LCPI15_11 +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: .LBB15_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill -; CHECK-NEXT: vmov r1, lr, d8 -; CHECK-NEXT: vadd.i32 q7, q7, r0 -; CHECK-NEXT: vmov r5, r4, d15 -; CHECK-NEXT: vadd.i32 q6, q0, r0 -; CHECK-NEXT: vmov r6, r7, d13 -; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q6, r0 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q6, q3, r0 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q3, q1, r0 +; CHECK-NEXT: vmov r10, r1, d15 +; CHECK-NEXT: vmov r7, r11, d6 +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov r5, r3, d13 +; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: subs.w r9, r9, #16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: subs.w r11, r11, #16 -; CHECK-NEXT: ldrb.w r9, [r1] -; CHECK-NEXT: vmov r1, r3, d14 +; CHECK-NEXT: ldrb r6, [r1] +; CHECK-NEXT: ldrb r1, [r7] +; CHECK-NEXT: vmov r7, r4, d12 ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[0], r1 -; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[1], r1 -; CHECK-NEXT: vmov r1, r3, d12 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q6[0], r7 +; CHECK-NEXT: vmov.8 q6[1], r4 +; CHECK-NEXT: vmov.8 q6[2], r5 +; CHECK-NEXT: vmov r4, r5, d14 +; CHECK-NEXT: vmov.8 q6[3], r3 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r7, [r5] +; CHECK-NEXT: vmov.8 q7[0], r4 +; CHECK-NEXT: ldrb.w r5, [r10] +; CHECK-NEXT: vmov.8 q7[1], r7 +; CHECK-NEXT: ldrb.w r7, [r11] ; CHECK-NEXT: vmov.8 q7[2], r5 -; CHECK-NEXT: ldrb r5, [r6] -; CHECK-NEXT: ldrb r6, [r4] +; CHECK-NEXT: vmov r5, r10, d5 ; CHECK-NEXT: vmov.8 q7[3], r6 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[0], r1 -; CHECK-NEXT: vmov r6, r1, d2 -; CHECK-NEXT: vmov.8 q6[1], r3 -; CHECK-NEXT: vmov.8 q6[2], r5 -; CHECK-NEXT: vmov.8 q6[3], r7 -; CHECK-NEXT: ldrb.w r7, [lr] -; CHECK-NEXT: vmov.8 q6[4], r9 -; CHECK-NEXT: vmov.8 q6[5], r7 -; CHECK-NEXT: ldrb r4, [r1] -; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, r3, d9 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[6], r1 -; CHECK-NEXT: vmov r1, r7, d0 -; CHECK-NEXT: vmov.8 q6[7], r3 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov r3, r4, d4 ; CHECK-NEXT: vmov.8 q7[4], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.8 q7[5], r7 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: ldrb r6, [r5] +; CHECK-NEXT: vmov r1, r5, d7 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q7[6], r1 -; CHECK-NEXT: ldrb r1, [r6] -; CHECK-NEXT: vmov r7, r6, d0 -; CHECK-NEXT: vmov.8 q7[7], r3 -; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[8], r1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.8 q7[9], r4 -; CHECK-NEXT: vmov r4, r1, d0 -; CHECK-NEXT: vmov.8 q7[10], r12 -; CHECK-NEXT: vmov.8 q7[11], r5 +; CHECK-NEXT: vmov r1, r7, d2 +; CHECK-NEXT: vmov.8 q7[7], r5 +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q6[4], r1 +; CHECK-NEXT: vmov r1, r5, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q6[5], r7 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q6[6], r1 +; CHECK-NEXT: ldrb r1, [r3] +; CHECK-NEXT: vmov.8 q6[7], r5 +; CHECK-NEXT: vmov r3, r7, d2 +; CHECK-NEXT: vmov.8 q6[8], r1 +; CHECK-NEXT: vmov r1, r11, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q6[9], r4 +; CHECK-NEXT: vmov.8 q6[10], r6 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r5, r6, d2 +; CHECK-NEXT: ldrb r4, [r7] +; CHECK-NEXT: ldrb.w r7, [r10] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q6[11], r7 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[8], r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov.8 q6[9], r1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q6[10], r5 -; CHECK-NEXT: vmov.8 q6[11], r4 -; CHECK-NEXT: vmov.8 q6[12], r7 -; CHECK-NEXT: vmov.8 q6[13], r6 -; CHECK-NEXT: vmov.8 q6[14], r3 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q7[8], r5 +; CHECK-NEXT: vmov r5, r7, d3 +; CHECK-NEXT: vmov.8 q7[9], r6 +; CHECK-NEXT: vadd.i32 q1, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, q0 +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q7[10], r5 +; CHECK-NEXT: vmov.8 q7[11], r7 +; CHECK-NEXT: vmov.8 q7[12], r3 +; CHECK-NEXT: vmov.8 q7[13], r4 +; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[12], r1 +; CHECK-NEXT: vmov.8 q6[12], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov.8 q6[13], r1 +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vadd.i32 q5, q5, q0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov.8 q6[14], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: ldrb.w r1, [lr] ; CHECK-NEXT: vmov.8 q6[15], r1 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: vadd.i8 q6, q6, q7 +; CHECK-NEXT: ldrb.w r1, [r11] +; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: vadd.i8 q6, q7, q6 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q7[0], r1 ; CHECK-NEXT: vmov.8 q7[1], r3 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[2], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[3], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[4], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[5], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vadd.i32 q5, q5, q2 -; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q5, q2 +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vadd.i32 q1, q4, r0 +; CHECK-NEXT: vadd.i32 q4, q4, q0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[6], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[7], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[8], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[9], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q4, r0 -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[10], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[11], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[12], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vmov r1, r3, d3 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[14], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: vadd.i8 q0, q6, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload -; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i8 q1, q6, q7 +; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q1, [r8], #16 +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q6, q6, q0 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill ; CHECK-NEXT: bne.w .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1 -; CHECK-NEXT: cmp r10, r2 +; CHECK-NEXT: cmp lr, r2 ; CHECK-NEXT: bne.w .LBB15_2 -; CHECK-NEXT: .LBB15_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #312 +; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI15_0: @@ -1287,102 +1272,95 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #56] @ 8-byte Folded Spill -; CHECK-NEXT: blt.w .LBB16_5 +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: blt .LBB16_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r5, .LCPI16_3 -; CHECK-NEXT: adr r7, .LCPI16_1 -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI16_0 -; CHECK-NEXT: adr r6, .LCPI16_2 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: bic r9, r1, #7 -; CHECK-NEXT: vldrw.u32 q3, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: mov.w lr, #16 -; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov.w r11, #16 +; CHECK-NEXT: bic r3, r1, #7 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB16_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 -; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: adr r1, .LCPI16_3 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: adr r1, .LCPI16_1 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: adr r1, .LCPI16_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI16_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q5, r0 -; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 -; CHECK-NEXT: vadd.i32 q6, q0, lr -; CHECK-NEXT: vmov r5, r6, d5 -; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 -; CHECK-NEXT: vadd.i32 q1, q7, lr -; CHECK-NEXT: vadd.i32 q4, q4, lr -; CHECK-NEXT: vadd.i32 q5, q5, lr -; CHECK-NEXT: ldrb.w r11, [r3] -; CHECK-NEXT: ldrb r3, [r7] -; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q7, q0, r0 +; CHECK-NEXT: vadd.i32 q6, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov r4, r5, d13 +; CHECK-NEXT: vadd.i32 q3, q5, r11 +; CHECK-NEXT: vmov lr, r8, d4 +; CHECK-NEXT: subs r3, #16 +; CHECK-NEXT: vmov r6, r12, d5 +; CHECK-NEXT: vadd.i32 q2, q4, r11 +; CHECK-NEXT: vadd.i32 q1, q1, r11 +; CHECK-NEXT: vadd.i32 q0, q0, r11 +; CHECK-NEXT: ldrb.w r10, [r5] +; CHECK-NEXT: vmov r2, r5, d12 +; CHECK-NEXT: vadd.i32 q6, q5, r0 +; CHECK-NEXT: vadd.i32 q5, q4, r0 +; CHECK-NEXT: ldrb.w r1, [r8] +; CHECK-NEXT: ldrb.w r9, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: ldrb.w r6, [lr] +; CHECK-NEXT: ldrb.w r12, [r12] +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q4[0], r2 +; CHECK-NEXT: vmov.8 q4[1], r5 +; CHECK-NEXT: vmov r8, r5, d11 +; CHECK-NEXT: vmov.8 q4[2], r9 +; CHECK-NEXT: vmov.8 q4[3], r10 +; CHECK-NEXT: vmov.8 q4[4], r6 +; CHECK-NEXT: vmov.8 q4[5], r1 +; CHECK-NEXT: vmov.8 q4[6], r4 +; CHECK-NEXT: vmov r4, r6, d10 +; CHECK-NEXT: vmov.8 q4[7], r12 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: ldrb.w lr, [r5] +; CHECK-NEXT: vmov r5, r2, d13 ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w r10, [r10] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb.w r1, [r12] -; CHECK-NEXT: vmov.8 q0[0], r7 -; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 -; CHECK-NEXT: vmov.8 q0[2], r5 -; CHECK-NEXT: vmov.8 q0[3], r6 -; CHECK-NEXT: vmov.8 q0[4], r4 -; CHECK-NEXT: vmov r4, r2, d4 -; CHECK-NEXT: vmov.8 q0[5], r10 -; CHECK-NEXT: vmov.8 q0[6], r3 -; CHECK-NEXT: vmov.8 q0[7], r11 -; CHECK-NEXT: ldrb r6, [r7] -; CHECK-NEXT: vmov r5, r7, d5 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q4[8], r4 +; CHECK-NEXT: vmov.8 q4[9], r6 +; CHECK-NEXT: ldrb.w r9, [r2] +; CHECK-NEXT: vmov r1, r2, d12 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w r10, [r2] +; CHECK-NEXT: ldrb.w r2, [r8] ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r3, [r5] -; CHECK-NEXT: ldrb.w r12, [r7] -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q0[8], r4 -; CHECK-NEXT: vmov.8 q0[9], r7 -; CHECK-NEXT: vmov.8 q0[10], r1 -; CHECK-NEXT: vmov.8 q0[11], r6 -; CHECK-NEXT: vmov.8 q0[12], r5 -; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r12 -; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vmov.8 q4[10], r2 +; CHECK-NEXT: vmov.8 q4[11], lr +; CHECK-NEXT: vmov.8 q4[12], r1 +; CHECK-NEXT: vmov.8 q4[13], r10 +; CHECK-NEXT: vmov.8 q4[14], r5 +; CHECK-NEXT: vmov.8 q4[15], r9 +; CHECK-NEXT: vstrb.8 q4, [r7], #16 +; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 -; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: cmp r9, r1 +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: bne .LBB16_2 ; CHECK-NEXT: .LBB16_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #64 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index eedca2cd4a5d3..b548a6ee99412 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -602,60 +602,57 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r0, [sp, #136] +; CHECK-NEXT: ldrne r0, [sp, #112] ; CHECK-NEXT: cmpne r0, #0 ; CHECK-NEXT: bne .LBB10_2 ; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader -; CHECK-NEXT: ldr.w r12, [sp, #140] +; CHECK-NEXT: ldr.w r12, [sp, #116] ; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r2, r12, #3 -; CHECK-NEXT: subs r3, r2, #4 -; CHECK-NEXT: add.w r0, r7, r3, lsr #2 -; CHECK-NEXT: ldr r7, [sp, #136] -; CHECK-NEXT: adr r3, .LCPI10_0 -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: lsl.w r0, r12, #1 -; CHECK-NEXT: vdup.32 q1, r7 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: lsls r6, r7, #1 -; CHECK-NEXT: vshl.i32 q3, q1, #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: bic r0, r12, #3 +; CHECK-NEXT: subs r3, r0, #4 +; CHECK-NEXT: add.w r3, r7, r3, lsr #2 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr r3, [sp, #112] +; CHECK-NEXT: lsl.w r7, r12, #1 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: vdup.32 q1, r3 +; CHECK-NEXT: lsls r6, r3, #1 +; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: b .LBB10_5 ; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r3, r0, r5, lsl #1 +; CHECK-NEXT: add.w r3, r2, r8, lsl #1 ; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4 ; CHECK-NEXT: b .LBB10_15 ; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r11, r12 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add r3, r0 -; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: adds r3, #1 -; CHECK-NEXT: cmp r3, r0 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload +; CHECK-NEXT: add r7, r12 +; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: add r5, r3 +; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: beq .LBB10_1 ; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -663,21 +660,22 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_15 Depth 2 -; CHECK-NEXT: mul r5, r3, r7 +; CHECK-NEXT: ldr r3, [sp, #112] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mul r8, r5, r3 ; CHECK-NEXT: beq .LBB10_3 ; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: b .LBB10_8 ; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r0, r8, r5 -; CHECK-NEXT: add.w r8, r8, #1 -; CHECK-NEXT: cmp r8, r7 -; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1] +; CHECK-NEXT: add.w r3, r9, r8 +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: strh.w r10, [r2, r3, lsl #1] +; CHECK-NEXT: ldr r3, [sp, #112] +; CHECK-NEXT: cmp r9, r3 ; CHECK-NEXT: beq .LBB10_4 ; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 @@ -692,46 +690,48 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: b .LBB10_13 ; CHECK-NEXT: .LBB10_10: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vmlas.i32 q5, q2, r8 -; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: adr r3, .LCPI10_0 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: vmlas.i32 q4, q5, r9 +; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: .LBB10_11: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q6, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: vadd.i32 q5, q4, q2 +; CHECK-NEXT: vldrh.s32 q6, [r1, q4, uxtw #1] +; CHECK-NEXT: vldrh.s32 q4, [r3], #8 +; CHECK-NEXT: vmul.i32 q4, q6, q4 +; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: vaddv.u32 r10, q4 -; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: cmp r2, r12 +; CHECK-NEXT: vaddv.u32 r10, q3 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: cmp r0, r12 ; CHECK-NEXT: beq .LBB10_7 ; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: mla r3, r7, r4, r8 -; CHECK-NEXT: add.w r0, r11, r4 -; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #112] ; CHECK-NEXT: sub.w lr, r12, r4 -; CHECK-NEXT: add.w r9, r7, r0, lsl #1 -; CHECK-NEXT: ldr r7, [sp, #136] -; CHECK-NEXT: add.w r3, r1, r3, lsl #1 +; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mla r3, r3, r4, r9 +; CHECK-NEXT: add.w r11, r1, r3, lsl #1 +; CHECK-NEXT: adds r3, r7, r4 +; CHECK-NEXT: add.w r3, r5, r3, lsl #1 ; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldrsh.w r4, [r3] -; CHECK-NEXT: add r3, r6 -; CHECK-NEXT: ldrsh r0, [r9], #2 -; CHECK-NEXT: smlabb r10, r4, r0, r10 +; CHECK-NEXT: ldrsh.w r5, [r11] +; CHECK-NEXT: add r11, r6 +; CHECK-NEXT: ldrsh r4, [r3], #2 +; CHECK-NEXT: smlabb r10, r5, r4, r10 ; CHECK-NEXT: le lr, .LBB10_14 ; CHECK-NEXT: b .LBB10_7 ; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1