diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 962c276bc2123..66d1126eb4151 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -709,7 +709,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // 16-bit SGPRs instead of 32-bit ones. if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg())) Old.setSubReg(AMDGPU::NoSubRegister); - Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); + if (New->getReg().isPhysical()) + Old.substPhysReg(New->getReg(), *TRI); + else + Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); Old.setIsUndef(New->isUndef()); return true; } @@ -1986,7 +1989,9 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy( if (!FoldingImm && !OpToFold.isReg()) return false; - if (OpToFold.isReg() && !OpToFold.getReg().isVirtual()) + // Fold virtual registers and constant physical registers. + if (OpToFold.isReg() && OpToFold.getReg().isPhysical() && + !TRI->isConstantPhysReg(OpToFold.getReg())) return false; // Prevent folding operands backwards in the function. For example, diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll index 4b6375cc60800..b4b49e90dca02 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll @@ -9,15 +9,14 @@ target triple = "amdgcn-amd-amdhsa" define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0 -; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s2, -1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_lshlrev_b32 v1, 20, v0 +; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, -1 ; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1] ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1 ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo @@ -27,20 +26,20 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_and_b32 s1, 1, s1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo ; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS @@ -56,27 +55,24 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1] ; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 20, v2 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo ; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v3 scope:SCOPE_SYS @@ -91,10 +87,9 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) { ; GFX1250-LABEL: use_flat_to_private_addrspacecast: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS @@ -110,9 +105,8 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: s_endpgm @@ -122,9 +116,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 5fc9f4a0f8038..817e3f01c8cdd 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -534,58 +534,61 @@ define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB34_6 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: s_cbranch_execnz .LBB34_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow2 +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB34_8 +; GFX1250-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB34_3: ; %atomicrmw.check.private +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB34_3 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execz .LBB34_5 +; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB34_3: ; %Flow +; GFX1250-NEXT: .LBB34_5: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB34_5 -; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB34_7 +; GFX1250-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE -; GFX1250-NEXT: .LBB34_5: ; %Flow1 +; GFX1250-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE +; GFX1250-NEXT: .LBB34_7: ; %Flow1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB34_6: ; %Flow2 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB34_8 -; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-NEXT: s_cbranch_execz .LBB34_2 +; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.shared +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3] -; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.phi +; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo +; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr %ptr, double %val monotonic ret double %result @@ -596,58 +599,61 @@ define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB35_6 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: s_cbranch_execnz .LBB35_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow2 +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB35_8 +; GFX1250-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB35_3: ; %atomicrmw.check.private +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB35_3 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execz .LBB35_5 +; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB35_3: ; %Flow +; GFX1250-NEXT: .LBB35_5: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB35_5 -; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB35_7 +; GFX1250-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE -; GFX1250-NEXT: .LBB35_5: ; %Flow1 +; GFX1250-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE +; GFX1250-NEXT: .LBB35_7: ; %Flow1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB35_6: ; %Flow2 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB35_8 -; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-NEXT: s_cbranch_execz .LBB35_2 +; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.shared +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3] -; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.phi +; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo +; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -682,40 +688,42 @@ define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB38_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB38_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB38_4 +; GFX1250-NEXT: .LBB38_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB38_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB38_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB38_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB38_2 +; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE -; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val monotonic ret double %result @@ -726,40 +734,42 @@ define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB39_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB39_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB39_4 +; GFX1250-NEXT: .LBB39_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB39_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB39_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB39_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB39_2 +; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE -; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -794,40 +804,42 @@ define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB42_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB42_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB42_4 +; GFX1250-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB42_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB42_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB42_2 +; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE -; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val monotonic ret double %result @@ -838,40 +850,42 @@ define double @flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB43_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB43_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB43_4 +; GFX1250-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB43_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB43_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB43_2 +; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE -; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -978,13 +992,11 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -996,10 +1008,9 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB52_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1021,13 +1032,11 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1039,10 +1048,9 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB53_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1064,13 +1072,11 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB54_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1082,10 +1088,9 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB54_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1107,13 +1112,11 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB55_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1125,10 +1128,9 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB55_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1150,13 +1152,11 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB56_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1168,10 +1168,9 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB56_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1193,13 +1192,11 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB57_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1211,10 +1208,9 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB57_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1236,13 +1232,11 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1254,10 +1248,9 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB58_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1279,13 +1272,11 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1297,10 +1288,9 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB59_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 004d3c0c1cf53..265848b441f69 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -252,11 +252,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -279,9 +278,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -297,12 +295,11 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -325,9 +322,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 @@ -354,13 +350,12 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -379,9 +374,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -397,7 +391,6 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -405,7 +398,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -428,9 +421,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 @@ -454,11 +446,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -478,9 +469,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE @@ -490,13 +480,12 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 @@ -515,9 +504,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE @@ -537,11 +525,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3 @@ -560,9 +546,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE @@ -572,16 +557,15 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 @@ -600,9 +584,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE @@ -680,11 +663,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -707,9 +689,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -725,12 +706,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -753,9 +733,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -782,13 +761,12 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -807,9 +785,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -825,7 +802,6 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -833,7 +809,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -856,9 +832,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -882,11 +857,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -906,9 +880,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -921,13 +894,12 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 @@ -946,9 +918,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -971,11 +942,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3 @@ -994,9 +963,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1009,16 +977,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 @@ -1037,9 +1004,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -1120,11 +1086,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1147,9 +1112,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1165,12 +1129,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1193,9 +1156,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -1222,13 +1184,12 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1247,9 +1208,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1265,7 +1225,6 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -1273,7 +1232,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1296,9 +1255,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -1322,11 +1280,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -1346,9 +1303,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1361,13 +1317,12 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 @@ -1386,9 +1341,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -1411,11 +1365,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3 @@ -1434,9 +1386,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1449,16 +1400,15 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 @@ -1477,9 +1427,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -1560,11 +1509,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1587,9 +1535,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1606,12 +1553,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1634,9 +1580,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -1664,13 +1609,12 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1689,9 +1633,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1708,7 +1651,6 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -1716,7 +1658,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1739,9 +1681,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -1766,11 +1707,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -1790,9 +1730,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1806,13 +1745,12 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 @@ -1831,9 +1769,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -1857,11 +1794,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3 @@ -1880,9 +1815,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1896,16 +1830,15 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 @@ -1924,9 +1857,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -2008,11 +1940,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2035,9 +1966,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2054,12 +1984,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2082,9 +2011,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -2112,13 +2040,12 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2137,9 +2064,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2156,7 +2082,6 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -2164,7 +2089,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2187,9 +2112,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -2214,11 +2138,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2238,9 +2161,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2254,13 +2176,12 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 @@ -2279,9 +2200,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -2305,11 +2225,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3 @@ -2328,9 +2246,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2344,16 +2261,15 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 @@ -2372,9 +2288,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -2456,11 +2371,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2483,9 +2397,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2502,12 +2415,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2530,9 +2442,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -2560,13 +2471,12 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2585,9 +2495,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2604,7 +2513,6 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -2612,7 +2520,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2635,9 +2543,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -2662,11 +2569,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2686,9 +2592,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2702,13 +2607,12 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 @@ -2727,9 +2631,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -2753,11 +2656,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3 @@ -2776,9 +2677,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2792,16 +2692,15 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 @@ -2820,9 +2719,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -2898,11 +2796,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2925,10 +2822,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2944,12 +2840,11 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2972,10 +2867,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3002,13 +2896,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3027,10 +2920,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3046,7 +2938,6 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3054,7 +2945,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3077,10 +2968,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3104,11 +2994,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -3127,9 +3016,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3142,13 +3030,12 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 @@ -3166,9 +3053,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -3191,11 +3077,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3 @@ -3213,9 +3097,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3228,16 +3111,15 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 @@ -3255,9 +3137,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -3332,11 +3213,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3359,10 +3239,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3378,12 +3257,11 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3406,10 +3284,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3436,13 +3313,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3461,10 +3337,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3480,7 +3355,6 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3488,7 +3362,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3511,10 +3385,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3538,11 +3411,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -3561,9 +3433,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3576,13 +3447,12 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 @@ -3600,9 +3470,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -3625,11 +3494,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3 @@ -3647,9 +3514,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3662,16 +3528,15 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 @@ -3689,9 +3554,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -3766,11 +3630,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3793,10 +3656,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3812,12 +3674,11 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3840,10 +3701,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3870,13 +3730,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3895,10 +3754,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3914,7 +3772,6 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3922,7 +3779,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3945,10 +3802,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -3972,11 +3828,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -3995,9 +3850,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4010,13 +3864,12 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 @@ -4034,9 +3887,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -4059,11 +3911,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3 @@ -4081,9 +3931,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4096,16 +3945,15 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 @@ -4123,9 +3971,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -4200,11 +4047,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4227,10 +4073,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4246,12 +4091,11 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4274,10 +4118,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -4304,13 +4147,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4329,10 +4171,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4348,7 +4189,6 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4356,7 +4196,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4379,10 +4219,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -4406,11 +4245,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -4429,9 +4267,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4444,13 +4281,12 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 @@ -4468,9 +4304,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -4493,11 +4328,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3 @@ -4515,9 +4348,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4530,16 +4362,15 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 @@ -4557,9 +4388,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -4654,12 +4484,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4684,9 +4513,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off @@ -4704,12 +4532,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4734,9 +4561,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4766,13 +4592,12 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4793,9 +4618,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off @@ -4813,7 +4637,6 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4821,7 +4644,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4846,9 +4669,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4873,13 +4695,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -4901,9 +4722,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off @@ -4918,13 +4738,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 @@ -4945,9 +4764,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -4972,11 +4790,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3 @@ -4997,9 +4813,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off @@ -5014,16 +4829,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 @@ -5044,9 +4858,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -5120,11 +4933,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5146,10 +4958,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5167,12 +4978,11 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5194,10 +5004,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5228,13 +5037,12 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5252,10 +5060,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5273,7 +5080,6 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -5281,7 +5087,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5303,10 +5109,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5334,11 +5139,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -5356,9 +5160,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5373,13 +5176,12 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 @@ -5396,9 +5198,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5425,11 +5226,9 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3 @@ -5446,9 +5245,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5463,16 +5261,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 @@ -5489,9 +5286,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5569,11 +5365,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5595,10 +5390,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5619,12 +5413,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5646,10 +5439,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5681,13 +5473,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5705,10 +5496,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5729,7 +5519,6 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -5737,7 +5526,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5759,10 +5548,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off @@ -5791,11 +5579,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -5813,9 +5600,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5833,13 +5619,12 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 @@ -5856,9 +5641,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -5886,11 +5670,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3 @@ -5907,9 +5689,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5927,16 +5708,15 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 @@ -5953,9 +5733,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index 725d57d852966..788cdd1c89051 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -337,11 +337,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1] -; SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -363,10 +361,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; SDAG-NEXT: s_cbranch_execz .LBB21_2 ; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private -; SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -380,7 +377,6 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -390,7 +386,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0 ; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_xor_b32_e32 v0, s2, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -412,10 +408,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 ; GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off