diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 768c0abd2e3f1..9cfa85d47029b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -91,64 +91,73 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V32RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(32); + addRegisterClass(MVT::f32, V32RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - const SIRegisterInfo *TRI = STI.getRegisterInfo(); - const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + const TargetRegisterClass *V64RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(64); addRegisterClass(MVT::f64, V64RegClass); addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160)); addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); + addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288)); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); + addRegisterClass(MVT::v10f32, + TRI->getDefaultVectorSuperClassForBitWidth(320)); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); + addRegisterClass(MVT::v11f32, + TRI->getDefaultVectorSuperClassForBitWidth(352)); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); + addRegisterClass(MVT::v12f32, + TRI->getDefaultVectorSuperClassForBitWidth(384)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, + TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ecf3aee6048cd..0a5b51a33f02d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3557,6 +3557,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { : getAnyVectorSuperClassForBitWidth(BitWidth); } +const TargetRegisterClass * +SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const { + // TODO: In principle this should use AV classes for gfx908 too. This is + // limited to 90a+ to avoid regressing special case copy optimizations which + // need new handling. The core issue is that it's not possible to directly + // copy between AGPRs on gfx908, and the current optimizations around that + // expect to see copies to VGPR. + return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth) + : getVGPRClassForBitWidth(BitWidth); +} + const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth == 16 || BitWidth == 32) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index cd4dc9bc4d037..365a76d4a5006 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -215,6 +215,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const; + LLVM_READONLY + const TargetRegisterClass * + getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const; + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 003aa049b2d1b..ae83766cd6a4a 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -9181,12 +9181,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9209,12 +9209,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9230,20 +9230,20 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fsub_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9256,20 +9256,20 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fsub_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9304,13 +9304,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9334,13 +9334,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9356,22 +9356,22 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9384,22 +9384,22 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmax_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9434,13 +9434,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9464,13 +9464,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9486,22 +9486,22 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9514,22 +9514,22 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmin_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB122_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9565,13 +9565,13 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9594,12 +9594,12 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9615,49 +9615,49 @@ define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9693,13 +9693,13 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9722,12 +9722,12 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9743,49 +9743,49 @@ define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fminimum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10009,10 +10009,10 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -10020,39 +10020,39 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB128_6 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v5 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB128_3 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB128_3: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB128_5 ; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB128_5: ; %Flow1 ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB128_6: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB128_8 ; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc -; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: .LBB128_8: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] @@ -10095,14 +10095,14 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB129_4: ; %Flow3 @@ -10154,14 +10154,14 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB129_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB129_4: ; %Flow3 @@ -10195,51 +10195,51 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fsub_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB130_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB130_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB130_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB130_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB130_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB130_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10249,48 +10249,48 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB130_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB130_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB130_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB130_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB130_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -10710,14 +10710,14 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB135_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB135_4: ; %Flow2 @@ -10778,14 +10778,14 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB135_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB135_4: ; %Flow2 @@ -10823,59 +10823,59 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB136_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB136_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB136_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB136_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB136_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB136_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10885,58 +10885,58 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB136_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB136_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB136_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB136_4: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB136_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB136_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -10978,14 +10978,14 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB137_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB137_4: ; %Flow2 @@ -11046,14 +11046,14 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB137_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB137_4: ; %Flow2 @@ -11091,59 +11091,59 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB138_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB138_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB138_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB138_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB138_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB138_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11153,58 +11153,58 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB138_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB138_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB138_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB138_4: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB138_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB138_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -17456,26 +17456,27 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB225_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17485,26 +17486,27 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB225_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v0, v1, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17521,20 +17523,20 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17548,20 +17550,20 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17582,29 +17584,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17614,29 +17616,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17653,29 +17655,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17683,29 +17685,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB228_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -17720,29 +17722,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17752,29 +17754,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17791,29 +17793,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17821,29 +17823,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB230_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -17858,29 +17860,30 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17890,26 +17893,27 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17926,29 +17930,30 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17956,20 +17961,20 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17990,29 +17995,30 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -18022,26 +18028,27 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -18058,29 +18065,30 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -18088,20 +18096,20 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18393,14 +18401,14 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB237_6 ; GFX90A-NEXT: .LBB237_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -18451,14 +18459,14 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB237_6 ; GFX950-NEXT: .LBB237_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -18496,43 +18504,43 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB238_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB238_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB238_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB238_6 ; GFX90A-NEXT: .LBB238_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB238_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB238_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -18547,40 +18555,40 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB238_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB238_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB238_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB238_6 ; GFX950-NEXT: .LBB238_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB238_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB238_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -18988,14 +18996,14 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB243_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB243_6 ; GFX90A-NEXT: .LBB243_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19055,14 +19063,14 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB243_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB243_6 ; GFX950-NEXT: .LBB243_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19104,51 +19112,51 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB244_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB244_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB244_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB244_6 ; GFX90A-NEXT: .LBB244_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB244_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB244_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -19163,50 +19171,50 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB244_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB244_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB244_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB244_6 ; GFX950-NEXT: .LBB244_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB244_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB244_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -19248,14 +19256,14 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB245_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB245_6 ; GFX90A-NEXT: .LBB245_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19315,14 +19323,14 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB245_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB245_6 ; GFX950-NEXT: .LBB245_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19364,51 +19372,51 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB246_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB246_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB246_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB246_6 ; GFX90A-NEXT: .LBB246_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB246_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB246_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -19423,50 +19431,50 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB246_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB246_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB246_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB246_6 ; GFX950-NEXT: .LBB246_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB246_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB246_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index 34a4899123749..c3531f16248e9 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -6023,12 +6023,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6051,12 +6051,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6072,20 +6072,20 @@ define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fsub_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6098,20 +6098,20 @@ define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fsub_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6146,13 +6146,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6176,13 +6176,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6198,22 +6198,22 @@ define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmax_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6226,22 +6226,22 @@ define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fmax_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6276,13 +6276,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6306,13 +6306,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6328,22 +6328,22 @@ define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmin_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6356,22 +6356,22 @@ define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fmin_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB122_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6407,13 +6407,13 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6436,12 +6436,12 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6457,49 +6457,49 @@ define void @global_atomic_fmaximum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmaximum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6535,13 +6535,13 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6564,12 +6564,12 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6585,49 +6585,49 @@ define void @global_atomic_fminimum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fminimum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6740,14 +6740,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -6770,14 +6770,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB129_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -6796,23 +6796,23 @@ define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB130_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6822,23 +6822,23 @@ define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: v_add_f64 v[2:3], v[4:5], -v[6:7] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB130_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -7020,14 +7020,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB135_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7055,14 +7055,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB135_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7080,28 +7080,28 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[10:11], v[2:3] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB136_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7110,29 +7110,29 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[4:5] -; GFX950-NEXT: v_max_f64 v[4:5], v[10:11], v[2:3] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB136_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -7164,14 +7164,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB137_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7199,14 +7199,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB137_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7224,28 +7224,28 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[4:5], v[10:11], v[2:3] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 glc +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB138_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7254,29 +7254,29 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[4:5] -; GFX950-NEXT: v_min_f64 v[4:5], v[10:11], v[2:3] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB138_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -11796,12 +11796,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11825,12 +11825,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11846,54 +11846,54 @@ define void @global_atomic_fsub_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fsub_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v0, v1, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -11923,13 +11923,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11954,13 +11954,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11976,58 +11976,58 @@ define void @global_atomic_fmax_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fmax_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmax_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB228_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12057,13 +12057,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12088,13 +12088,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12110,58 +12110,58 @@ define void @global_atomic_fmin_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fmin_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmin_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB230_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12192,13 +12192,13 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12222,12 +12222,12 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12243,57 +12243,57 @@ define void @global_atomic_fmaximum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fmaximum_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v1 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v2, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12324,13 +12324,13 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12354,12 +12354,12 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12375,57 +12375,57 @@ define void @global_atomic_fminimum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fminimum_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_min_f32_e32 v3, v5, v1 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v2, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12536,14 +12536,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12567,14 +12567,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12590,54 +12590,54 @@ define void @global_atomic_fsub_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fsub_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB238_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB238_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB238_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB238_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12826,14 +12826,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB243_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12862,14 +12862,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB243_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12885,63 +12885,63 @@ define void @global_atomic_fmaximum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fmaximum_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB244_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB244_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB244_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[8:9], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB244_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12974,14 +12974,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB245_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -13010,14 +13010,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB245_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -13033,63 +13033,63 @@ define void @global_atomic_fminimum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fminimum_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB246_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB246_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB246_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[8:9], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB246_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 3c316f4acedb7..89c3e8d615efd 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -381,17 +381,17 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX90A-LABEL: no_unsafe: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -836,7 +836,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_getpc_b64 s[4:5] ; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v2, s[4:5] +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v2, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_branch .LBB5_7 ; GFX90A-NEXT: .LBB5_6: ; %Flow @@ -846,7 +846,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: .LBB5_7: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 @@ -856,6 +855,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_7 ; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end1 @@ -926,7 +926,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_getpc_b64 s[0:1] ; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[0:1] +; GFX942-NEXT: global_load_dwordx2 v[4:5], v2, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_branch .LBB5_7 ; GFX942-NEXT: .LBB5_6: ; %Flow @@ -936,7 +936,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: .LBB5_7: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 @@ -946,6 +945,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_7 ; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end1 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 37040123ee20c..42f76c4a10d2a 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 ; CHECK-NEXT: v_mov_b32_e32 v20, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 @@ -16,12 +16,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 ; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 ; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6 ; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f ; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90 @@ -37,8 +35,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883 ; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136 +; CHECK-NEXT: v_mov_b32_e32 v16, 0x57b87036 +; CHECK-NEXT: v_mov_b32_e32 v17, 0x3fb3b136 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523 ; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555 @@ -64,10 +62,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25] -; CHECK-NEXT: v_accvgpr_read_b32 v27, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a2 +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[0:1] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1] ; CHECK-NEXT: v_accvgpr_write_b32 a0, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -85,9 +81,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[14:15] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: v_fmac_f64_e32 v[16:17], 0, v[28:29] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19] -; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[16:17] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[16:17] +; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[18:19] +; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 @@ -96,8 +93,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 ; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a0 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_mov_b64 vcc, s[2:3] @@ -112,9 +109,9 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 @@ -132,13 +129,13 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cbranch_vccz .LBB0_13 ; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[28:29] +; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[26:27] ; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] ; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17] -; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v17, v16 +; CHECK-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[8:9] +; CHECK-NEXT: v_mov_b32_e32 v27, v26 ; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] -; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13] +; CHECK-NEXT: global_store_dwordx2 v20, v[26:27], s[12:13] ; CHECK-NEXT: s_cselect_b32 s23, s23, 0 ; CHECK-NEXT: s_cselect_b32 s22, s22, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll index b80aa9324e616..4909fac4f307d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll @@ -18,7 +18,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -53,7 +53,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -89,7 +89,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -127,7 +127,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -170,7 +170,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -217,7 +217,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -265,7 +265,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -315,7 +315,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll index 2ce54f8a463c7..30536b18674b5 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll @@ -108,7 +108,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 @@ -136,7 +136,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 @@ -164,7 +164,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 @@ -194,7 +194,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 @@ -340,7 +340,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 @@ -374,7 +374,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 @@ -408,7 +408,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 @@ -444,7 +444,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 9c03c850c8242..0e86a1ac68119 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -421,19 +421,19 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1 +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB18_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end @@ -458,19 +458,19 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1 +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB19_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 1a4a54b81c78f..5c4e25c3120e9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -4448,18 +4448,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4771,18 +4771,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5462,18 +5462,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 59b0537b817d2..53e9468c5d5b6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -34,24 +34,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -79,23 +79,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -177,24 +177,24 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -224,23 +224,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -324,21 +324,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -347,6 +344,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -386,20 +384,20 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -950,24 +948,24 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -997,25 +995,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1252,24 +1250,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1331,23 +1329,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1445,24 +1443,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1490,23 +1488,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1592,24 +1590,24 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1637,23 +1635,23 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1735,24 +1733,24 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1782,23 +1780,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1882,21 +1880,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -1905,6 +1900,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1944,20 +1940,20 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2508,24 +2504,24 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2555,25 +2551,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5415,54 +5411,50 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_6 -; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB24_2 -; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index c9c9f332fe391..5ee3ff67aa8a0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -34,24 +34,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -79,23 +79,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -177,24 +177,24 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -224,23 +224,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -324,21 +324,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -347,6 +344,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -386,20 +384,20 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -950,24 +948,24 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -997,25 +995,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1252,24 +1250,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1331,23 +1329,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1445,24 +1443,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1490,23 +1488,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1592,24 +1590,24 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1637,23 +1635,23 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1735,24 +1733,24 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1782,23 +1780,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1882,21 +1880,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -1905,6 +1900,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1944,20 +1940,20 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2508,24 +2504,24 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2555,25 +2551,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -5415,54 +5411,50 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_6 -; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB24_2 -; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 587c2ea885077..0e563c26d27ea 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -49,12 +49,11 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 @@ -62,6 +61,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -122,18 +122,18 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -245,12 +245,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 @@ -258,6 +257,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -319,18 +319,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -445,27 +445,25 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -533,18 +531,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1254,12 +1252,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 @@ -1267,6 +1264,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1328,12 +1326,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc @@ -1342,6 +1339,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1656,12 +1654,11 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 @@ -1669,6 +1666,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1729,18 +1727,18 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1852,12 +1850,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 @@ -1865,6 +1862,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1926,18 +1924,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2052,27 +2050,25 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2140,18 +2136,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2861,12 +2857,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 @@ -2874,6 +2869,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2935,12 +2931,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc @@ -2949,6 +2944,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3295,12 +3291,11 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB16_4 ; GFX942-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 @@ -3308,6 +3303,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB16_2 ; GFX942-NEXT: ; %bb.3: ; %Flow @@ -3433,51 +3429,47 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_6 -; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB16_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB16_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB16_2 -; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB16_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3713,8 +3705,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, s[2:3] +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v9 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3726,31 +3718,31 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB17_3: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] -; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB17_4 ; GFX942-NEXT: ; %bb.5: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB17_2 ; GFX942-NEXT: .LBB17_6: ; %atomicrmw.private -; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] @@ -3870,10 +3862,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0x7f8, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v9 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3885,29 +3877,29 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 ; GFX90A-NEXT: ; %bb.5: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB17_2 ; GFX90A-NEXT: .LBB17_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4160,8 +4152,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: s_movk_i32 s2, 0xf800 ; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, s[2:3] +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v9 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4173,31 +4165,31 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] -; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 ; GFX942-NEXT: ; %bb.5: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB18_2 ; GFX942-NEXT: .LBB18_6: ; %atomicrmw.private -; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] @@ -4317,10 +4309,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v9 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4332,29 +4324,29 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: .LBB18_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 357234080235a..8a4b2c428e31a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -12687,20 +12687,19 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12812,20 +12811,19 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index bd9fe397bfc68..aede3928d7b0f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -46,7 +46,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] ; GFX90A-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.1 ; GFX90A-NEXT: {{ $}} @@ -76,12 +76,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_dpp6]] ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} @@ -89,28 +90,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: successors: %bb.4(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY10]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX90A-NEXT: S_BRANCH %bb.4 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 + ; GFX90A-NEXT: [[PHI:%[0-9]+]]:av_32 = PHI [[DEF]], %bb.0, %8, %bb.4 ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4 (%ir-block.35): ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[COPY11]], %bb.2 ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX90A-NEXT: early-clobber %48:vgpr_32 = STRICT_WWM [[COPY8]], implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %48, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]] + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY13]], [[COPY12]], implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[V_CNDMASK_B32_e64_]] ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5 (%ir-block.41): @@ -128,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] ; GFX942-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX942-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX942-NEXT: [[DEF:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX942-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.1 ; GFX942-NEXT: {{ $}} @@ -158,12 +161,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX942-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_dpp6]] ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 ; GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX942-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX942-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX942-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX942-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX942-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.2 ; GFX942-NEXT: {{ $}} @@ -171,28 +175,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: successors: %bb.4(0x80000000) ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY10]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX942-NEXT: S_BRANCH %bb.4 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.3.Flow: ; GFX942-NEXT: successors: %bb.5(0x80000000) ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 + ; GFX942-NEXT: [[PHI:%[0-9]+]]:av_32 = PHI [[DEF]], %bb.0, %8, %bb.4 ; GFX942-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.5 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.4 (%ir-block.35): ; GFX942-NEXT: successors: %bb.3(0x80000000) ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[COPY11]], %bb.2 ; GFX942-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX942-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec - ; GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] - ; GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX942-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[COPY8]], implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY12:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]] + ; GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY13]], [[COPY12]], implicit $exec + ; GFX942-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[V_CNDMASK_B32_e64_]] ; GFX942-NEXT: S_BRANCH %bb.3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.5 (%ir-block.41): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 02884559bdaa9..3d0ebc72791bd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -38,33 +38,35 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY6]], 0, 0, implicit $exec :: (load (s64) from %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.atomicrmw.start: ; GFX90A-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.0, %3, %bb.1 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY7]], %bb.0, %3, %bb.1 ; GFX90A-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI1]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 - ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 - ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 - ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 - ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] - ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY11]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY12]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U64_e64 [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], [[PHI1]], implicit $exec + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] ; GFX90A-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U64_e64_]], [[PHI]], implicit-def dead $scc ; GFX90A-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.atomicrmw.end: - ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], %bb.1 + ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY13]], %bb.1 ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec - ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec + ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index a50791e10f5a2..1978e68fdae9c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -90,18 +90,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -287,18 +287,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -486,18 +486,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1283,12 +1283,11 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -1297,6 +1296,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1697,18 +1697,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1912,18 +1912,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2319,18 +2319,18 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2719,18 +2719,18 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3712,18 +3712,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4130,18 +4130,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4532,18 +4532,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6724,18 +6724,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 311faac1b7c29..dc995fb7ef79c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -35,24 +35,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -80,23 +80,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -197,24 +197,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -242,23 +242,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -361,24 +361,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -406,23 +406,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1000,24 +1000,24 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1045,25 +1045,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1328,24 +1328,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1407,23 +1407,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1559,24 +1559,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1604,23 +1604,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1725,24 +1725,24 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1770,23 +1770,23 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1887,24 +1887,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1932,23 +1932,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2051,24 +2051,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2096,23 +2096,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2690,24 +2690,24 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2735,25 +2735,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -4122,24 +4122,24 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index e2808ee9bf706..f62e13a9d4341 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -35,24 +35,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -80,23 +80,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -197,24 +197,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -242,23 +242,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -361,24 +361,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -406,23 +406,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1000,24 +1000,24 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1045,25 +1045,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1328,24 +1328,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1407,23 +1407,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1559,24 +1559,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1604,23 +1604,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1725,24 +1725,24 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1770,23 +1770,23 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1887,24 +1887,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1932,23 +1932,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2051,24 +2051,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2096,23 +2096,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2690,24 +2690,24 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2735,25 +2735,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -4122,24 +4122,24 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 11f0f38d2b6fa..9e6f0fd7f13b5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -50,12 +50,11 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 @@ -63,6 +62,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -123,18 +123,18 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -282,12 +282,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 @@ -295,6 +294,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -355,18 +355,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -516,12 +516,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 @@ -529,6 +528,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -589,18 +589,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1438,12 +1438,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 @@ -1451,6 +1450,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1511,12 +1511,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -1525,6 +1524,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1905,12 +1905,11 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 @@ -1918,6 +1917,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1978,18 +1978,18 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2137,12 +2137,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 @@ -2150,6 +2149,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2210,18 +2210,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2371,12 +2371,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 @@ -2384,6 +2383,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2444,18 +2444,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3293,12 +3293,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 @@ -3306,6 +3305,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3366,12 +3366,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -3380,6 +3379,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3760,12 +3760,11 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 @@ -3773,6 +3772,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3836,18 +3836,18 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4012,12 +4012,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0 @@ -4025,6 +4024,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4088,18 +4088,18 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4265,12 +4265,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0 @@ -4278,6 +4277,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4341,18 +4341,18 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index e330c72ba0fc4..fe432e9d7594d 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1376,47 +1376,46 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; ; GFX90A-LABEL: test_mfma_loop_sgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 @@ -1438,47 +1437,46 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; ; GFX942-LABEL: test_mfma_loop_sgpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 @@ -1643,13 +1641,13 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1679,7 +1677,6 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader @@ -1706,13 +1703,13 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1742,7 +1739,6 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader @@ -2401,7 +2397,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2471,7 +2466,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB9_2 Depth 2 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index be1788c6ec83f..2462414992e36 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,15 +6,15 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -22,12 +22,14 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_ashr_i32 s5, s3, 31 ; GFX942-NEXT: s_mov_b32 s3, s2 ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 ; GFX942-NEXT: s_and_b32 s3, s5, s4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] +; GFX942-NEXT: s_nop 6 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a2 ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -35,7 +37,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: ; GFX942-NEXT: ; implicit-def: $sgpr3 -; GFX942-NEXT: ; implicit-def: $agpr2 +; GFX942-NEXT: ; implicit-def: $vgpr0 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ;