From d9321dd183919007a5430d758360c8470f9c7818 Mon Sep 17 00:00:00 2001 From: John Lu Date: Wed, 22 Oct 2025 12:59:54 -0500 Subject: [PATCH 01/15] Delete redundant s_or_b32 Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 21 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 941 +++++++++--------- .../test/CodeGen/AMDGPU/carryout-selection.ll | 4 - .../expand-scalar-carry-out-select-user.ll | 10 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 368 ++++--- llvm/test/CodeGen/AMDGPU/srem64.ll | 410 ++++---- llvm/test/CodeGen/AMDGPU/uaddo.ll | 6 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 199 ++-- llvm/test/CodeGen/AMDGPU/urem64.ll | 296 +++--- llvm/test/CodeGen/AMDGPU/usubo.ll | 6 +- 11 files changed, 1058 insertions(+), 1205 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d9f76c9a59d00..305c9c40ab726 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10160,7 +10160,7 @@ static bool followSubRegDef(MachineInstr &MI, } MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { assert(MRI.isSSA()); if (!P.Reg.isVirtual()) return nullptr; @@ -10689,6 +10689,25 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!optimizeSCC(Def, &CmpInstr, RI)) return false; + // If s_or_32 result is unused (i.e. 
it is effectively a 64-bit s_cmp_lg of + // a register pair) and the input is a 64-bit foldableSelect then transform: + // + // (s_or_b32 (S_CSELECT_B64 (non-zero imm), 0), 0 => (S_CSELECT_B64 + // (non-zero + // imm), 0) + if (Def->getOpcode() == AMDGPU::S_OR_B32 && + MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { + MachineOperand OrOpnd1 = Def->getOperand(1); + MachineOperand OrOpnd2 = Def->getOperand(2); + + if (OrOpnd1.isReg() && OrOpnd2.isReg() && + OrOpnd1.getReg() != OrOpnd2.getReg()) { + auto *Def1 = getVRegSubRegDef(getRegSubRegPair(OrOpnd1), *MRI); + auto *Def2 = getVRegSubRegDef(getRegSubRegPair(OrOpnd2), *MRI); + if (Def1 == Def2 && foldableSelect(Def1)) + optimizeSCC(Def1, Def); + } + } return true; }; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21f959ce..c4d0678c0f989 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1687,7 +1687,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, /// skipping copy like instructions and subreg-manipulation pseudos. /// Following another subreg of a reg:subreg isn't supported. MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI); + const MachineRegisterInfo &MRI); /// \brief Return false if EXEC is not changed between the def of \p VReg at \p /// DefMI and the use at \p UseMI. Should be run on SSA. 
Currently does not diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 51df8c34cc55e..54b1554ae5d04 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7772,7 +7772,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX6-NEXT: s_ashr_i32 s8, s1, 31 @@ -7782,8 +7781,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX6-NEXT: s_sub_u32 s12, 0, s10 -; GFX6-NEXT: s_subb_u32 s13, 0, s11 +; GFX6-NEXT: s_sub_u32 s0, 0, s10 +; GFX6-NEXT: s_subb_u32 s1, 0, s11 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -7792,128 +7791,121 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_mul_i32 s15, s13, s0 -; GFX6-NEXT: s_mul_i32 s16, s12, s0 -; GFX6-NEXT: s_add_i32 s1, s17, s1 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 -; GFX6-NEXT: s_add_i32 s1, s1, s15 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 -; GFX6-NEXT: s_mul_i32 s17, s0, s1 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 -; 
GFX6-NEXT: s_add_u32 s15, s15, s17 -; GFX6-NEXT: v_readfirstlane_b32 s17, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s17 -; GFX6-NEXT: s_mul_i32 s16, s14, s16 -; GFX6-NEXT: v_readfirstlane_b32 s18, v4 -; GFX6-NEXT: s_add_u32 s15, s15, s16 -; GFX6-NEXT: s_addc_u32 s15, s17, s18 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: s_addc_u32 s16, s16, 0 -; GFX6-NEXT: s_mul_i32 s1, s14, s1 -; GFX6-NEXT: s_add_u32 s1, s15, s1 -; GFX6-NEXT: s_addc_u32 s15, 0, s16 -; GFX6-NEXT: s_add_u32 s16, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s14, s14, s15 -; GFX6-NEXT: s_mul_i32 s0, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_add_i32 s0, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s13, s16 -; GFX6-NEXT: s_mul_i32 s1, s12, s16 -; GFX6-NEXT: s_add_i32 s0, s0, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s0 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_mul_i32 s1, s14, s1 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s1, s13, s1 -; GFX6-NEXT: s_addc_u32 s1, s15, s12 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 -; GFX6-NEXT: s_mul_i32 s0, s14, s0 -; GFX6-NEXT: s_add_u32 s0, s1, s0 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s15, s16, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s14, s14, s12 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s13, s0, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v2 +; GFX6-NEXT: 
s_mul_i32 s14, s1, s2 +; GFX6-NEXT: s_mul_i32 s15, s0, s2 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s15 +; GFX6-NEXT: s_add_i32 s13, s13, s14 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s15 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_mul_i32 s16, s2, s13 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s14, s14, s16 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s15, s12, s15 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s17, v4 +; GFX6-NEXT: s_add_u32 s14, s14, s15 +; GFX6-NEXT: s_addc_u32 s14, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 +; GFX6-NEXT: s_addc_u32 s15, s15, 0 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_add_u32 s13, s14, s13 +; GFX6-NEXT: s_addc_u32 s14, 0, s15 +; GFX6-NEXT: s_add_u32 s13, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, s14 +; GFX6-NEXT: s_mul_i32 s14, s0, s12 +; GFX6-NEXT: s_mul_i32 s1, s1, s13 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s0, s0, s13 +; GFX6-NEXT: s_add_i32 s1, s14, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mul_i32 s15, s13, s1 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s15, s17, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s0, s12, s0 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s0, s15, s0 +; GFX6-NEXT: s_addc_u32 s0, s16, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 +; GFX6-NEXT: s_add_u32 s0, s0, s1 +; GFX6-NEXT: s_addc_u32 s1, 0, s14 +; GFX6-NEXT: s_add_u32 s14, 
s13, s0 +; GFX6-NEXT: s_addc_u32 s15, s12, s1 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: s_addc_u32 s1, s7, s12 ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 -; GFX6-NEXT: s_mul_i32 s1, s6, s14 +; GFX6-NEXT: s_mul_i32 s1, s6, s15 ; GFX6-NEXT: v_readfirstlane_b32 s16, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: s_add_u32 s1, s16, s1 ; GFX6-NEXT: s_addc_u32 s4, 0, s4 -; GFX6-NEXT: s_mul_i32 s15, s7, s15 +; GFX6-NEXT: s_mul_i32 s14, s7, s14 ; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: s_add_u32 s1, s1, s15 +; GFX6-NEXT: s_add_u32 s1, s1, s14 ; GFX6-NEXT: s_addc_u32 s1, s4, s16 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s16, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_mul_i32 s14, s7, s15 +; GFX6-NEXT: s_add_u32 s14, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s4 +; GFX6-NEXT: s_addc_u32 s15, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s17 +; GFX6-NEXT: s_mul_i32 s4, s10, s15 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s16 -; GFX6-NEXT: s_add_i32 s18, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s18 -; GFX6-NEXT: s_mul_i32 s4, s10, s16 +; GFX6-NEXT: s_mul_i32 s5, s11, s14 +; GFX6-NEXT: s_add_i32 s16, s4, s5 +; GFX6-NEXT: s_sub_i32 s17, s7, s16 +; GFX6-NEXT: s_mul_i32 s4, s10, s14 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s4, s5 -; GFX6-NEXT: 
s_subb_u32 s19, s14, s11 -; GFX6-NEXT: s_sub_u32 s20, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s20, s10 +; GFX6-NEXT: s_subb_u32 s17, s17, s11 +; GFX6-NEXT: s_sub_u32 s18, s6, s10 +; GFX6-NEXT: s_subb_u32 s17, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s11 ; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s20, s16, 2 -; GFX6-NEXT: s_addc_u32 s21, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s20, s15 -; GFX6-NEXT: s_cselect_b32 s15, s21, s19 +; GFX6-NEXT: s_cmp_ge_u32 s18, s10 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s17, s11 +; GFX6-NEXT: s_cselect_b32 s17, s18, s19 +; GFX6-NEXT: s_add_u32 s18, s14, 1 +; GFX6-NEXT: s_addc_u32 s19, s15, 0 +; GFX6-NEXT: s_add_u32 s20, s14, 2 +; GFX6-NEXT: s_addc_u32 s21, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s17, s20, s18 +; GFX6-NEXT: s_cselect_b32 s18, s21, s19 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_subb_u32 s4, s7, s18 +; GFX6-NEXT: s_subb_u32 s4, s7, s16 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s6, s10 @@ -7921,13 +7913,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_cmp_eq_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s4, s6, s5 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s5, s15, s17 -; GFX6-NEXT: s_cselect_b32 s4, s14, s16 +; GFX6-NEXT: s_cselect_b32 s5, s18, s15 +; GFX6-NEXT: s_cselect_b32 s4, s17, s14 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 ; GFX6-NEXT: s_subb_u32 s5, s5, s7 ; GFX6-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -8278,8 +8271,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_u32 s14, 0, s6 -; GFX6-NEXT: s_subb_u32 s15, 0, s7 +; GFX6-NEXT: s_sub_u32 s12, 0, s6 +; GFX6-NEXT: s_subb_u32 s13, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8288,69 +8281,65 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 -; GFX6-NEXT: v_readfirstlane_b32 s16, v1 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_mul_i32 s13, s14, s16 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_mul_i32 s16, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s19, v2 -; GFX6-NEXT: s_mul_i32 s17, s15, s12 -; GFX6-NEXT: s_mul_i32 s18, s14, s12 -; GFX6-NEXT: s_add_i32 s13, s19, s13 +; GFX6-NEXT: s_mul_i32 s17, s13, s15 +; GFX6-NEXT: s_mul_i32 s18, s12, s15 +; GFX6-NEXT: s_add_i32 s16, s19, s16 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s18 -; GFX6-NEXT: s_add_i32 s13, s13, s17 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: s_add_i32 s16, s16, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, s18 ; GFX6-NEXT: v_readfirstlane_b32 s17, v3 -; GFX6-NEXT: s_mul_i32 s20, s12, s13 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_mul_i32 s20, s15, s16 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s16 ; GFX6-NEXT: s_add_u32 s17, s17, s20 ; GFX6-NEXT: v_readfirstlane_b32 s20, v0 -; GFX6-NEXT: 
s_mul_i32 s18, s16, s18 +; GFX6-NEXT: s_mul_i32 s18, s14, s18 ; GFX6-NEXT: s_addc_u32 s20, 0, s20 ; GFX6-NEXT: v_readfirstlane_b32 s19, v4 ; GFX6-NEXT: s_add_u32 s17, s17, s18 ; GFX6-NEXT: s_addc_u32 s17, s20, s19 ; GFX6-NEXT: v_readfirstlane_b32 s18, v1 ; GFX6-NEXT: s_addc_u32 s18, s18, 0 -; GFX6-NEXT: s_mul_i32 s13, s16, s13 -; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_add_u32 s16, s17, s16 ; GFX6-NEXT: s_addc_u32 s17, 0, s18 -; GFX6-NEXT: s_add_u32 s18, s12, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_addc_u32 s16, s16, s17 -; GFX6-NEXT: s_mul_i32 s12, s14, s16 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s15, s15, s18 -; GFX6-NEXT: s_mul_i32 s13, s14, s18 -; GFX6-NEXT: s_add_i32 s12, s12, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mul_hi_u32 v3, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s18, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s18, v0 -; GFX6-NEXT: s_mul_i32 s15, s18, s12 -; GFX6-NEXT: v_readfirstlane_b32 s19, v2 -; GFX6-NEXT: s_add_u32 s15, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s15, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_addc_u32 s14, s14, s17 +; GFX6-NEXT: s_mul_i32 s16, s12, s14 +; GFX6-NEXT: s_mul_i32 s13, s13, s15 ; GFX6-NEXT: v_readfirstlane_b32 s17, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s13 -; GFX6-NEXT: s_addc_u32 s17, 0, s17 -; GFX6-NEXT: v_readfirstlane_b32 s14, v3 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: s_addc_u32 s13, s17, s14 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_addc_u32 s14, s14, 0 -; GFX6-NEXT: s_mul_i32 s12, s16, s12 -; GFX6-NEXT: s_add_u32 s12, s13, s12 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: s_add_u32 s15, s18, s12 -; GFX6-NEXT: 
s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_addc_u32 s14, s16, s14 +; GFX6-NEXT: s_add_i32 s16, s17, s16 +; GFX6-NEXT: s_mul_i32 s12, s12, s15 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX6-NEXT: s_mul_i32 s17, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s19, v2 +; GFX6-NEXT: s_add_u32 s17, s19, s17 +; GFX6-NEXT: v_readfirstlane_b32 s18, v0 +; GFX6-NEXT: s_mul_i32 s12, s14, s12 +; GFX6-NEXT: s_addc_u32 s18, 0, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v3 +; GFX6-NEXT: s_add_u32 s12, s17, s12 +; GFX6-NEXT: s_addc_u32 s12, s18, s16 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s13, s14, s13 +; GFX6-NEXT: s_add_u32 s12, s12, s13 +; GFX6-NEXT: s_addc_u32 s13, 0, s16 +; GFX6-NEXT: s_add_u32 s15, s15, s12 +; GFX6-NEXT: s_addc_u32 s14, s14, s13 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 @@ -8374,40 +8363,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s18, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_add_u32 s17, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s19 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s18 -; GFX6-NEXT: s_add_i32 s20, s14, s15 -; GFX6-NEXT: s_sub_i32 s16, s9, s20 -; GFX6-NEXT: s_mul_i32 s14, s6, s18 +; GFX6-NEXT: s_mul_i32 s15, s7, s17 +; 
GFX6-NEXT: s_add_i32 s18, s14, s15 +; GFX6-NEXT: s_sub_i32 s19, s9, s18 +; GFX6-NEXT: s_mul_i32 s14, s6, s17 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s14, s15 -; GFX6-NEXT: s_subb_u32 s21, s16, s7 -; GFX6-NEXT: s_sub_u32 s22, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s16, s17 -; GFX6-NEXT: s_subb_u32 s16, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s6 +; GFX6-NEXT: s_subb_u32 s19, s19, s7 +; GFX6-NEXT: s_sub_u32 s20, s8, s6 +; GFX6-NEXT: s_subb_u32 s19, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s16, s21, s17 -; GFX6-NEXT: s_add_u32 s17, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s16, s22, s17 -; GFX6-NEXT: s_cselect_b32 s17, s23, s21 +; GFX6-NEXT: s_cmp_ge_u32 s20, s6 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s7 +; GFX6-NEXT: s_cselect_b32 s19, s20, s21 +; GFX6-NEXT: s_add_u32 s20, s17, 1 +; GFX6-NEXT: s_addc_u32 s21, s16, 0 +; GFX6-NEXT: s_add_u32 s22, s17, 2 +; GFX6-NEXT: s_addc_u32 s23, s16, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s19, s22, s20 +; GFX6-NEXT: s_cselect_b32 s20, s23, s21 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s9, s9, s20 +; GFX6-NEXT: s_subb_u32 s9, s9, s18 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 ; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 @@ -8415,12 +8401,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 ; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s17, s19 -; GFX6-NEXT: s_cselect_b32 s6, s16, s18 +; GFX6-NEXT: s_cselect_b32 
s7, s20, s16 +; GFX6-NEXT: s_cselect_b32 s6, s19, s17 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s16, s6, s2 -; GFX6-NEXT: s_subb_u32 s17, s7, s3 +; GFX6-NEXT: s_sub_u32 s14, s6, s2 +; GFX6-NEXT: s_subb_u32 s15, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8428,8 +8414,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_u32 s12, 0, s8 -; GFX6-NEXT: s_subb_u32 s13, 0, s9 +; GFX6-NEXT: s_sub_u32 s2, 0, s8 +; GFX6-NEXT: s_subb_u32 s3, 0, s9 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8438,128 +8424,121 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s0, s13, s2 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s12, s2 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: s_mul_i32 s4, s2, s3 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s13, s2, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v2 +; GFX6-NEXT: s_mul_i32 s1, s3, s0 +; GFX6-NEXT: s_add_i32 s13, s16, s13 +; GFX6-NEXT: s_add_i32 s13, s13, s1 +; GFX6-NEXT: s_mul_i32 s1, s2, 
s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: s_mul_i32 s16, s0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 -; GFX6-NEXT: s_add_u32 s4, s18, s4 -; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s16, s18, s16 +; GFX6-NEXT: s_addc_u32 s17, 0, s17 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 -; GFX6-NEXT: s_addc_u32 s4, s5, s18 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 -; GFX6-NEXT: s_add_u32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s4, 0, s5 -; GFX6-NEXT: s_add_u32 s5, s2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 -; GFX6-NEXT: s_mul_i32 s2, s12, s4 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_add_i32 s2, s3, s2 -; GFX6-NEXT: s_mul_i32 s13, s13, s5 -; GFX6-NEXT: s_mul_i32 s3, s12, s5 -; GFX6-NEXT: s_add_i32 s2, s2, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_add_u32 s1, s16, s1 +; GFX6-NEXT: s_addc_u32 s1, s17, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_addc_u32 s13, 0, s16 +; GFX6-NEXT: s_add_u32 s16, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_addc_u32 s4, s12, s13 +; GFX6-NEXT: s_mul_i32 s5, s2, s4 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_add_i32 s5, s12, s5 +; GFX6-NEXT: s_mul_i32 s3, s3, s16 +; 
GFX6-NEXT: s_mul_i32 s2, s2, s16 +; GFX6-NEXT: s_add_i32 s3, s5, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 -; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 +; GFX6-NEXT: s_mul_i32 s12, s16, s3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s12, s17, s12 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 -; GFX6-NEXT: s_add_u32 s2, s3, s2 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s13, s5, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s12, s4, s12 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s5, v3 +; GFX6-NEXT: s_add_u32 s2, s12, s2 +; GFX6-NEXT: s_addc_u32 s2, s13, s5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_add_u32 s2, s2, s3 +; GFX6-NEXT: s_addc_u32 s3, 0, s5 +; GFX6-NEXT: s_add_u32 s12, s16, s2 +; GFX6-NEXT: s_addc_u32 s13, s4, s3 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: 
v_mul_hi_u32 v3, s10, v2 -; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_mul_i32 s2, s10, s13 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s15, s2 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 -; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v1 -; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s14, s15 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_add_u32 s2, s17, s2 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s18, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: s_add_u32 s2, s2, s12 +; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s13, s11, s13 +; GFX6-NEXT: s_add_u32 s16, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s19 +; GFX6-NEXT: s_addc_u32 s17, 0, s12 +; GFX6-NEXT: s_mul_i32 s12, s8, s17 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s18 -; GFX6-NEXT: s_add_i32 s20, s12, s13 -; GFX6-NEXT: s_sub_i32 s14, s11, s20 -; GFX6-NEXT: s_mul_i32 s12, s8, s18 +; GFX6-NEXT: s_mul_i32 s13, s9, s16 +; GFX6-NEXT: s_add_i32 s18, s12, s13 +; GFX6-NEXT: s_sub_i32 s19, s11, s18 +; GFX6-NEXT: s_mul_i32 s12, s8, s16 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 -; GFX6-NEXT: s_subb_u32 s21, s14, s9 -; GFX6-NEXT: s_sub_u32 s22, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, 
s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s8 +; GFX6-NEXT: s_subb_u32 s19, s19, s9 +; GFX6-NEXT: s_sub_u32 s20, s10, s8 +; GFX6-NEXT: s_subb_u32 s19, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s9 ; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_add_u32 s15, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s22, s15 -; GFX6-NEXT: s_cselect_b32 s15, s23, s21 +; GFX6-NEXT: s_cmp_ge_u32 s20, s8 +; GFX6-NEXT: s_cselect_b32 s20, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s19, s9 +; GFX6-NEXT: s_cselect_b32 s19, s20, s21 +; GFX6-NEXT: s_add_u32 s20, s16, 1 +; GFX6-NEXT: s_addc_u32 s21, s17, 0 +; GFX6-NEXT: s_add_u32 s22, s16, 2 +; GFX6-NEXT: s_addc_u32 s23, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s19, s22, s20 +; GFX6-NEXT: s_cselect_b32 s20, s23, s21 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s11, s11, s20 +; GFX6-NEXT: s_subb_u32 s11, s11, s18 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 ; GFX6-NEXT: s_cselect_b32 s12, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 @@ -8567,15 +8546,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s12 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s15, s19 -; GFX6-NEXT: s_cselect_b32 s8, s14, s18 +; GFX6-NEXT: s_cselect_b32 s9, s20, s17 +; GFX6-NEXT: s_cselect_b32 s8, s19, s16 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; 
GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9015,105 +8994,100 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_u32 s10, 0, s8 -; GFX6-NEXT: s_subb_u32 s11, 0, s9 +; GFX6-NEXT: s_sub_u32 s0, 0, s8 +; GFX6-NEXT: s_subb_u32 s1, 0, s9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s10, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s1, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_mul_i32 s13, s11, s0 -; GFX6-NEXT: s_mul_i32 s14, s10, s0 -; GFX6-NEXT: s_add_i32 s1, s15, s1 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s14 -; GFX6-NEXT: s_add_i32 s1, s1, s13 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, s14 -; GFX6-NEXT: v_readfirstlane_b32 s13, v3 -; GFX6-NEXT: s_mul_i32 s15, s0, s1 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 -; GFX6-NEXT: s_add_u32 s13, s13, s15 -; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: s_mul_i32 s14, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s16, v4 -; GFX6-NEXT: s_add_u32 s13, s13, s14 -; GFX6-NEXT: s_addc_u32 s13, s15, s16 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_addc_u32 s14, s14, 0 -; GFX6-NEXT: s_mul_i32 s1, s12, s1 -; GFX6-NEXT: s_add_u32 s1, s13, s1 -; GFX6-NEXT: s_addc_u32 s13, 0, s14 -; GFX6-NEXT: s_add_u32 s14, s0, s1 -; GFX6-NEXT: 
v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s12, s12, s13 -; GFX6-NEXT: s_mul_i32 s0, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_add_i32 s0, s1, s0 -; GFX6-NEXT: s_mul_i32 s11, s11, s14 -; GFX6-NEXT: s_mul_i32 s1, s10, s14 -; GFX6-NEXT: s_add_i32 s0, s0, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 -; GFX6-NEXT: s_mul_i32 s11, s14, s0 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s11, s15, s11 -; GFX6-NEXT: v_readfirstlane_b32 s13, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s1 -; GFX6-NEXT: s_addc_u32 s13, 0, s13 -; GFX6-NEXT: v_readfirstlane_b32 s10, v3 -; GFX6-NEXT: s_add_u32 s1, s11, s1 -; GFX6-NEXT: s_addc_u32 s1, s13, s10 +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 -; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s0, s12, s0 -; GFX6-NEXT: s_add_u32 s0, s1, s0 -; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_add_u32 s13, s14, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_addc_u32 s12, s12, s10 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s11, s0, s10 +; GFX6-NEXT: v_readfirstlane_b32 s14, v2 +; GFX6-NEXT: s_mul_i32 s12, s1, s2 +; GFX6-NEXT: s_mul_i32 s13, s0, s2 +; GFX6-NEXT: s_add_i32 s11, s14, s11 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s13 +; GFX6-NEXT: s_add_i32 s11, s11, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s13 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_mul_i32 s14, s2, s11 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s11 +; GFX6-NEXT: s_add_u32 s12, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s13, s10, s13 +; GFX6-NEXT: s_addc_u32 
s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s15, v4 +; GFX6-NEXT: s_add_u32 s12, s12, s13 +; GFX6-NEXT: s_addc_u32 s12, s14, s15 +; GFX6-NEXT: v_readfirstlane_b32 s13, v1 +; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_mul_i32 s11, s10, s11 +; GFX6-NEXT: s_add_u32 s11, s12, s11 +; GFX6-NEXT: s_addc_u32 s12, 0, s13 +; GFX6-NEXT: s_add_u32 s11, s2, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: s_addc_u32 s10, s10, s12 +; GFX6-NEXT: s_mul_i32 s12, s0, s10 +; GFX6-NEXT: s_mul_i32 s1, s1, s11 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_add_i32 s12, s13, s12 +; GFX6-NEXT: s_mul_i32 s0, s0, s11 +; GFX6-NEXT: s_add_i32 s1, s12, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX6-NEXT: s_mul_i32 s13, s11, s1 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s0, s10, s0 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_add_u32 s0, s13, s0 +; GFX6-NEXT: s_addc_u32 s0, s14, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s1, s10, s1 +; GFX6-NEXT: s_add_u32 s0, s0, s1 +; GFX6-NEXT: s_addc_u32 s1, 0, s12 +; GFX6-NEXT: s_add_u32 s12, s11, s0 +; GFX6-NEXT: s_addc_u32 s13, s10, s1 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 ; GFX6-NEXT: s_mov_b32 s11, s10 ; GFX6-NEXT: s_addc_u32 s1, s7, s10 ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: 
v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 -; GFX6-NEXT: s_mul_i32 s1, s6, s12 +; GFX6-NEXT: s_mul_i32 s1, s6, s13 ; GFX6-NEXT: v_readfirstlane_b32 s14, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: s_add_u32 s1, s14, s1 ; GFX6-NEXT: s_addc_u32 s4, 0, s4 -; GFX6-NEXT: s_mul_i32 s13, s7, s13 +; GFX6-NEXT: s_mul_i32 s12, s7, s12 ; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_add_u32 s1, s1, s12 ; GFX6-NEXT: s_addc_u32 s1, s4, s14 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 -; GFX6-NEXT: s_mul_i32 s12, s7, s12 +; GFX6-NEXT: s_mul_i32 s12, s7, s13 ; GFX6-NEXT: s_add_u32 s12, s1, s12 ; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 @@ -9128,11 +9102,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s4, s5 ; GFX6-NEXT: s_subb_u32 s15, s13, s9 ; GFX6-NEXT: s_sub_u32 s16, s6, s8 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s12, s13 ; GFX6-NEXT: s_subb_u32 s17, s15, 0 ; GFX6-NEXT: s_cmp_ge_u32 s17, s9 ; GFX6-NEXT: s_cselect_b32 s18, -1, 0 @@ -9141,13 +9113,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_cmp_eq_u32 s17, s9 ; GFX6-NEXT: s_cselect_b32 s18, s19, s18 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s15, s15, s9 -; GFX6-NEXT: s_sub_u32 s19, s16, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_subb_u32 s12, s15, s9 +; GFX6-NEXT: s_sub_u32 s13, s16, s8 +; GFX6-NEXT: s_subb_u32 s12, s12, 0 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b32 s13, s19, s16 +; GFX6-NEXT: s_cselect_b32 s13, s13, s16 ; GFX6-NEXT: s_cselect_b32 s12, s12, s17 ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: 
s_subb_u32 s4, s7, s14 @@ -9164,6 +9134,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -9405,8 +9376,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s12, 0, s2 -; GFX6-NEXT: s_subb_u32 s13, 0, s3 +; GFX6-NEXT: s_sub_u32 s6, 0, s2 +; GFX6-NEXT: s_subb_u32 s7, 0, s3 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9415,69 +9386,65 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s7, s12, s14 +; GFX6-NEXT: v_mul_hi_u32 v2, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_mul_i32 s14, s6, s12 ; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_mul_i32 s15, s13, s6 -; GFX6-NEXT: s_mul_i32 s16, s12, s6 -; GFX6-NEXT: s_add_i32 s7, s17, s7 +; GFX6-NEXT: s_mul_i32 s15, s7, s13 +; GFX6-NEXT: s_mul_i32 s16, s6, s13 +; GFX6-NEXT: s_add_i32 s14, s17, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 -; GFX6-NEXT: s_add_i32 s7, s7, s15 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s7 +; GFX6-NEXT: s_add_i32 s14, s14, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v3 -; GFX6-NEXT: s_mul_i32 s18, s6, s7 -; GFX6-NEXT: 
v_mul_hi_u32 v1, v1, s7 +; GFX6-NEXT: s_mul_i32 s18, s13, s14 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s14 ; GFX6-NEXT: s_add_u32 s15, s15, s18 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_mul_i32 s16, s12, s16 ; GFX6-NEXT: s_addc_u32 s18, 0, s18 ; GFX6-NEXT: v_readfirstlane_b32 s17, v4 ; GFX6-NEXT: s_add_u32 s15, s15, s16 ; GFX6-NEXT: s_addc_u32 s15, s18, s17 ; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 -; GFX6-NEXT: s_mul_i32 s7, s14, s7 -; GFX6-NEXT: s_add_u32 s7, s15, s7 +; GFX6-NEXT: s_mul_i32 s14, s12, s14 +; GFX6-NEXT: s_add_u32 s14, s15, s14 ; GFX6-NEXT: s_addc_u32 s15, 0, s16 -; GFX6-NEXT: s_add_u32 s16, s6, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_addc_u32 s14, s14, s15 -; GFX6-NEXT: s_mul_i32 s6, s12, s14 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_add_i32 s6, s7, s6 -; GFX6-NEXT: s_mul_i32 s13, s13, s16 -; GFX6-NEXT: s_mul_i32 s7, s12, s16 -; GFX6-NEXT: s_add_i32 s6, s6, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 -; GFX6-NEXT: s_mul_i32 s13, s16, s6 -; GFX6-NEXT: v_readfirstlane_b32 s17, v2 -; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_add_u32 s13, s13, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: s_addc_u32 s12, s12, s15 +; GFX6-NEXT: s_mul_i32 s14, s6, s12 +; GFX6-NEXT: s_mul_i32 s7, s7, s13 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 -; GFX6-NEXT: s_mul_i32 s7, s14, s7 -; GFX6-NEXT: s_addc_u32 s15, 0, s15 -; GFX6-NEXT: v_readfirstlane_b32 s12, v3 -; GFX6-NEXT: s_add_u32 s7, s13, s7 -; GFX6-NEXT: s_addc_u32 s7, s15, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: s_addc_u32 s12, s12, 0 -; 
GFX6-NEXT: s_mul_i32 s6, s14, s6 -; GFX6-NEXT: s_add_u32 s6, s7, s6 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: s_add_u32 s13, s16, s6 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b32 s6, s6, s7 -; GFX6-NEXT: s_addc_u32 s12, s14, s12 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s6, s6, s13 +; GFX6-NEXT: s_add_i32 s7, s14, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 +; GFX6-NEXT: s_mul_i32 s15, s13, s7 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s15, s17, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s6, s12, s6 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s6, s15, s6 +; GFX6-NEXT: s_addc_u32 s6, s16, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s7, s12, s7 +; GFX6-NEXT: s_add_u32 s6, s6, s7 +; GFX6-NEXT: s_addc_u32 s7, 0, s14 +; GFX6-NEXT: s_add_u32 s13, s13, s6 +; GFX6-NEXT: s_addc_u32 s12, s12, s7 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -9514,11 +9481,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 ; GFX6-NEXT: s_subb_u32 s17, s14, s3 ; GFX6-NEXT: s_sub_u32 s18, s8, s2 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s14, s15 ; GFX6-NEXT: s_subb_u32 s19, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s19, s3 ; GFX6-NEXT: s_cselect_b32 s20, -1, 0 @@ -9527,13 +9492,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s19, s3 ; GFX6-NEXT: s_cselect_b32 s20, s21, s20 ; 
GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s17, s17, s3 -; GFX6-NEXT: s_sub_u32 s21, s18, s2 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s17, 0 +; GFX6-NEXT: s_subb_u32 s14, s17, s3 +; GFX6-NEXT: s_sub_u32 s15, s18, s2 +; GFX6-NEXT: s_subb_u32 s14, s14, 0 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s15, s21, s18 +; GFX6-NEXT: s_cselect_b32 s15, s15, s18 ; GFX6-NEXT: s_cselect_b32 s14, s14, s19 ; GFX6-NEXT: s_or_b32 s12, s12, s13 ; GFX6-NEXT: s_subb_u32 s9, s9, s16 @@ -9556,8 +9519,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_u32 s8, 0, s6 -; GFX6-NEXT: s_subb_u32 s9, 0, s7 +; GFX6-NEXT: s_sub_u32 s2, 0, s6 +; GFX6-NEXT: s_subb_u32 s3, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9566,70 +9529,66 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s12 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s0, s9, s2 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s8, s2 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: s_mul_i32 s4, s2, s3 -; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s9, s2, s8 +; GFX6-NEXT: 
v_readfirstlane_b32 s12, v2 +; GFX6-NEXT: s_mul_i32 s1, s3, s0 +; GFX6-NEXT: s_add_i32 s9, s12, s9 +; GFX6-NEXT: s_add_i32 s9, s9, s1 +; GFX6-NEXT: s_mul_i32 s1, s2, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s9 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: s_mul_i32 s12, s0, s9 +; GFX6-NEXT: v_readfirstlane_b32 s13, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 -; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 -; GFX6-NEXT: s_add_u32 s4, s16, s4 -; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s9 +; GFX6-NEXT: s_add_u32 s12, s16, s12 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: s_mul_i32 s1, s8, s1 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s13 -; GFX6-NEXT: s_addc_u32 s4, s5, s16 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s12, s3 -; GFX6-NEXT: s_add_u32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s4, 0, s5 -; GFX6-NEXT: s_add_u32 s5, s2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s12, s4 -; GFX6-NEXT: s_mul_i32 s2, s8, s4 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_add_i32 s2, s3, s2 -; GFX6-NEXT: s_mul_i32 s9, s9, s5 -; GFX6-NEXT: s_mul_i32 s3, s8, s5 -; GFX6-NEXT: s_add_i32 s2, s2, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_add_u32 s1, s12, s1 +; GFX6-NEXT: s_addc_u32 s1, s13, s16 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s9, s8, s9 +; GFX6-NEXT: s_add_u32 s1, s1, s9 +; GFX6-NEXT: s_addc_u32 s9, 0, s12 +; GFX6-NEXT: s_add_u32 s12, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_addc_u32 s4, s8, s9 +; 
GFX6-NEXT: s_mul_i32 s5, s2, s4 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: s_add_i32 s5, s8, s5 +; GFX6-NEXT: s_mul_i32 s3, s3, s12 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 +; GFX6-NEXT: s_add_i32 s3, s5, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v2 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_mul_i32 s9, s5, s2 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_mul_i32 s8, s12, s3 ; GFX6-NEXT: v_readfirstlane_b32 s13, v2 -; GFX6-NEXT: s_add_u32 s9, s13, s9 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 -; GFX6-NEXT: v_readfirstlane_b32 s8, v3 -; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s12, s8 -; GFX6-NEXT: v_readfirstlane_b32 s8, v1 -; GFX6-NEXT: s_addc_u32 s8, s8, 0 +; GFX6-NEXT: s_add_u32 s8, s13, s8 +; GFX6-NEXT: v_readfirstlane_b32 s9, v0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 -; GFX6-NEXT: s_add_u32 s2, s3, s2 -; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s12, s5, s2 -; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s13, s4, s8 +; GFX6-NEXT: s_addc_u32 s9, 0, s9 +; GFX6-NEXT: v_readfirstlane_b32 s5, v3 +; GFX6-NEXT: s_add_u32 s2, s8, s2 +; GFX6-NEXT: s_addc_u32 s2, s9, s5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_add_u32 s2, s2, s3 +; GFX6-NEXT: s_addc_u32 s3, 0, s5 +; GFX6-NEXT: s_add_u32 s12, s12, s2 +; GFX6-NEXT: s_addc_u32 s13, s4, s3 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 @@ -9667,11 +9626,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 
s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s13, s10, s11 ; GFX6-NEXT: s_subb_u32 s17, s12, s7 ; GFX6-NEXT: s_sub_u32 s18, s8, s6 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s12, s13 ; GFX6-NEXT: s_subb_u32 s19, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s20, -1, 0 @@ -9680,13 +9637,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_eq_u32 s19, s7 ; GFX6-NEXT: s_cselect_b32 s20, s21, s20 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s17, s7 -; GFX6-NEXT: s_sub_u32 s21, s18, s6 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s17, 0 +; GFX6-NEXT: s_subb_u32 s12, s17, s7 +; GFX6-NEXT: s_sub_u32 s13, s18, s6 +; GFX6-NEXT: s_subb_u32 s12, s12, 0 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s13, s21, s18 +; GFX6-NEXT: s_cselect_b32 s13, s13, s18 ; GFX6-NEXT: s_cselect_b32 s12, s12, s19 ; GFX6-NEXT: s_or_b32 s10, s10, s11 ; GFX6-NEXT: s_subb_u32 s9, s9, s16 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index b96de173dc8c6..8d05317162e9c 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -702,8 +702,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_add_u32 s4, s4, s6 -; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CISI-NEXT: s_or_b32 s6, s12, s13 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1674,8 +1672,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_sub_u32 s4, s4, s6 -; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CISI-NEXT: s_or_b32 s6, s12, s13 ; CISI-NEXT: 
s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index dbdea8e3c533d..71af21a11c2ce 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -12,8 +12,6 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s7, s6, s6 -; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -88,15 +86,13 @@ bb: define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s2, s2 -; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_addc_u32 s0, s2, 0 +; GFX7-NEXT: s_add_u32 s1, s0, s0 +; GFX7-NEXT: s_addc_u32 s0, s0, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX7-NEXT: s_cbranch_vccnz .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 71f5a94a7f245..74a6d7fe39362 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -8,7 +8,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: 
s_ashr_i32 s8, s1, 31 ; GCN-NEXT: s_add_u32 s0, s0, s8 @@ -17,8 +16,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GCN-NEXT: s_sub_u32 s12, 0, s10 -; GCN-NEXT: s_subb_u32 s13, 0, s11 +; GCN-NEXT: s_sub_u32 s0, 0, s10 +; GCN-NEXT: s_subb_u32 s1, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -27,128 +26,121 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s17, v2 -; GCN-NEXT: s_mul_i32 s15, s13, s0 -; GCN-NEXT: s_mul_i32 s16, s12, s0 -; GCN-NEXT: s_add_i32 s1, s17, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s16 -; GCN-NEXT: s_add_i32 s1, s1, s15 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s16 -; GCN-NEXT: v_readfirstlane_b32 s15, v3 -; GCN-NEXT: s_mul_i32 s17, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s15, s15, s17 -; GCN-NEXT: v_readfirstlane_b32 s17, v0 -; GCN-NEXT: s_addc_u32 s17, 0, s17 -; GCN-NEXT: s_mul_i32 s16, s14, s16 -; GCN-NEXT: v_readfirstlane_b32 s18, v4 -; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_addc_u32 s15, s17, s18 -; GCN-NEXT: v_readfirstlane_b32 s16, v1 -; GCN-NEXT: s_addc_u32 s16, s16, 0 -; GCN-NEXT: s_mul_i32 s1, s14, s1 -; GCN-NEXT: s_add_u32 s1, s15, s1 -; GCN-NEXT: s_addc_u32 s15, 0, s16 -; GCN-NEXT: s_add_u32 s16, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s14, s14, s15 
-; GCN-NEXT: s_mul_i32 s0, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s1, s12, s16 -; GCN-NEXT: s_add_i32 s0, s0, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s16, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s14, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s16, v0 -; GCN-NEXT: s_mul_i32 s13, s16, s0 -; GCN-NEXT: v_readfirstlane_b32 s17, v2 -; GCN-NEXT: s_add_u32 s13, s17, s13 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s1, s14, s1 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s12, v3 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s1, s15, s12 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s0, s14, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s15, s16, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s14, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s13, s0, s12 +; GCN-NEXT: v_readfirstlane_b32 s16, v2 +; GCN-NEXT: s_mul_i32 s14, s1, s2 +; GCN-NEXT: s_mul_i32 s15, s0, s2 +; GCN-NEXT: s_add_i32 s13, s16, s13 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s15 +; GCN-NEXT: s_add_i32 s13, s13, s14 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s13 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s15 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: s_mul_i32 s16, s2, s13 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s13 +; GCN-NEXT: s_add_u32 s14, s14, s16 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s15, s12, s15 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: v_readfirstlane_b32 s17, v4 +; GCN-NEXT: s_add_u32 s14, s14, s15 +; GCN-NEXT: s_addc_u32 s14, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: s_addc_u32 s15, s15, 0 +; GCN-NEXT: s_mul_i32 
s13, s12, s13 +; GCN-NEXT: s_add_u32 s13, s14, s13 +; GCN-NEXT: s_addc_u32 s14, 0, s15 +; GCN-NEXT: s_add_u32 s13, s2, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s14, s0, s12 +; GCN-NEXT: s_mul_i32 s1, s1, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_add_i32 s14, s15, s14 +; GCN-NEXT: s_mul_i32 s0, s0, s13 +; GCN-NEXT: s_add_i32 s1, s14, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 +; GCN-NEXT: s_mul_i32 s15, s13, s1 +; GCN-NEXT: v_readfirstlane_b32 s17, v2 +; GCN-NEXT: s_add_u32 s15, s17, s15 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s0, s12, s0 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: s_add_u32 s0, s15, s0 +; GCN-NEXT: s_addc_u32 s0, s16, s14 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s1, s12, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s14 +; GCN-NEXT: s_add_u32 s14, s13, s0 +; GCN-NEXT: s_addc_u32 s15, s12, s1 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 ; GCN-NEXT: s_mov_b32 s13, s12 ; GCN-NEXT: s_addc_u32 s1, s7, s12 ; GCN-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_mul_hi_u32 v1, s7, v2 -; GCN-NEXT: s_mul_i32 s1, s6, s14 +; GCN-NEXT: s_mul_i32 s1, s6, s15 ; GCN-NEXT: v_readfirstlane_b32 s16, v3 ; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 ; GCN-NEXT: s_add_u32 s1, s16, s1 ; GCN-NEXT: s_addc_u32 s4, 0, s4 -; GCN-NEXT: s_mul_i32 
s15, s7, s15 +; GCN-NEXT: s_mul_i32 s14, s7, s14 ; GCN-NEXT: v_readfirstlane_b32 s16, v1 -; GCN-NEXT: s_add_u32 s1, s1, s15 +; GCN-NEXT: s_add_u32 s1, s1, s14 ; GCN-NEXT: s_addc_u32 s1, s4, s16 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 -; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s16, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: s_mul_i32 s14, s7, s15 +; GCN-NEXT: s_add_u32 s14, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s17, 0, s4 +; GCN-NEXT: s_addc_u32 s15, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s17 +; GCN-NEXT: s_mul_i32 s4, s10, s15 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s16 -; GCN-NEXT: s_add_i32 s18, s4, s5 -; GCN-NEXT: s_sub_i32 s14, s7, s18 -; GCN-NEXT: s_mul_i32 s4, s10, s16 +; GCN-NEXT: s_mul_i32 s5, s11, s14 +; GCN-NEXT: s_add_i32 s16, s4, s5 +; GCN-NEXT: s_sub_i32 s17, s7, s16 +; GCN-NEXT: s_mul_i32 s4, s10, s14 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s15, s4, s5 -; GCN-NEXT: s_subb_u32 s19, s14, s11 -; GCN-NEXT: s_sub_u32 s20, s6, s10 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_or_b32 s14, s14, s15 -; GCN-NEXT: s_subb_u32 s14, s19, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_subb_u32 s17, s17, s11 +; GCN-NEXT: s_sub_u32 s18, s6, s10 +; GCN-NEXT: s_subb_u32 s17, s17, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s11 ; GCN-NEXT: s_cselect_b32 s19, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s14, s19, s15 -; GCN-NEXT: s_add_u32 s15, s16, 1 -; GCN-NEXT: s_addc_u32 s19, s17, 0 -; GCN-NEXT: s_add_u32 s20, s16, 2 -; GCN-NEXT: s_addc_u32 s21, s17, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s14, s20, s15 -; GCN-NEXT: s_cselect_b32 s15, s21, s19 +; GCN-NEXT: s_cmp_ge_u32 s18, s10 
+; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s17, s11 +; GCN-NEXT: s_cselect_b32 s17, s18, s19 +; GCN-NEXT: s_add_u32 s18, s14, 1 +; GCN-NEXT: s_addc_u32 s19, s15, 0 +; GCN-NEXT: s_add_u32 s20, s14, 2 +; GCN-NEXT: s_addc_u32 s21, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s17, s20, s18 +; GCN-NEXT: s_cselect_b32 s18, s21, s19 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s18 +; GCN-NEXT: s_subb_u32 s4, s7, s16 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s10 @@ -156,13 +148,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s4, s6, s5 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s15, s17 -; GCN-NEXT: s_cselect_b32 s4, s14, s16 +; GCN-NEXT: s_cselect_b32 s5, s18, s15 +; GCN-NEXT: s_cselect_b32 s4, s17, s14 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 ; GCN-NEXT: s_subb_u32 s5, s5, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -202,8 +195,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -235,8 +226,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s16, s16, s20 ; GCN-IR-NEXT: s_subb_u32 s17, s17, s21 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; 
GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1150,8 +1139,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s2, 0, s6 -; GCN-NEXT: s_subb_u32 s10, 0, s7 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s8, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1161,115 +1149,109 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v2 -; GCN-NEXT: s_mul_i32 s12, s10, s8 -; GCN-NEXT: s_mul_i32 s13, s2, s8 -; GCN-NEXT: s_add_i32 s9, s14, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 -; GCN-NEXT: s_add_i32 s9, s9, s12 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 -; GCN-NEXT: s_add_u32 s12, s12, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s14, v4 -; GCN-NEXT: s_add_u32 s12, s12, s13 -; GCN-NEXT: s_addc_u32 s12, s15, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NEXT: s_mul_i32 s9, s11, s9 -; GCN-NEXT: s_add_u32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s12, 0, s13 -; GCN-NEXT: s_add_u32 s13, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s10, s2, s9 +; GCN-NEXT: v_readfirstlane_b32 s13, v2 +; GCN-NEXT: s_mul_i32 
s11, s8, s3 +; GCN-NEXT: s_mul_i32 s12, s2, s3 +; GCN-NEXT: s_add_i32 s10, s13, s10 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 +; GCN-NEXT: s_add_i32 s10, s10, s11 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s10 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s10 +; GCN-NEXT: s_mul_i32 s14, s3, s10 +; GCN-NEXT: s_add_u32 s11, s11, s14 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s12, s9, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v4 +; GCN-NEXT: s_add_u32 s11, s11, s12 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: s_addc_u32 s11, s14, s13 +; GCN-NEXT: s_addc_u32 s12, s15, 0 +; GCN-NEXT: s_mul_i32 s10, s9, s10 +; GCN-NEXT: s_add_u32 s10, s11, s10 +; GCN-NEXT: s_addc_u32 s11, 0, s12 +; GCN-NEXT: s_add_u32 s10, s3, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s11, s11, s12 -; GCN-NEXT: s_mul_i32 s8, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s10, s10, s13 -; GCN-NEXT: s_mul_i32 s2, s2, s13 -; GCN-NEXT: s_add_i32 s8, s8, s10 +; GCN-NEXT: s_addc_u32 s9, s9, s11 +; GCN-NEXT: s_mul_i32 s11, s2, s9 +; GCN-NEXT: s_mul_i32 s8, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_add_i32 s11, s12, s11 +; GCN-NEXT: s_mul_i32 s2, s2, s10 +; GCN-NEXT: s_add_i32 s8, s11, s8 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 -; GCN-NEXT: s_mul_i32 s10, s13, s8 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_mul_i32 s12, s10, s8 ; GCN-NEXT: v_readfirstlane_b32 s14, v2 -; GCN-NEXT: s_add_u32 s10, 
s14, s10 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s2, s11, s2 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v3 -; GCN-NEXT: s_add_u32 s2, s10, s2 -; GCN-NEXT: s_addc_u32 s2, s12, s9 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: s_mul_i32 s8, s11, s8 +; GCN-NEXT: s_add_u32 s12, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s2, s9, s2 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: s_add_u32 s2, s12, s2 +; GCN-NEXT: s_addc_u32 s2, s13, s11 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s8, s9, s8 ; GCN-NEXT: s_add_u32 s2, s2, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s9 -; GCN-NEXT: s_add_u32 s2, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s8, s11, s10 +; GCN-NEXT: s_addc_u32 s8, 0, s11 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s8, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 ; GCN-NEXT: s_mul_i32 s8, s8, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s12, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: s_addc_u32 s10, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s12 +; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s13, s9, s8 -; GCN-NEXT: s_sub_i32 s10, 0, s13 -; GCN-NEXT: s_mul_i32 s8, s6, s12 -; GCN-NEXT: s_sub_u32 s14, 24, s8 +; GCN-NEXT: s_add_i32 s11, s9, s8 +; GCN-NEXT: s_sub_i32 s12, 0, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s10 +; GCN-NEXT: s_sub_u32 s13, 24, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s11, s8, s9 -; GCN-NEXT: 
s_subb_u32 s15, s10, s7 -; GCN-NEXT: s_sub_u32 s16, s14, s6 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s15, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s6 +; GCN-NEXT: s_subb_u32 s12, s12, s7 +; GCN-NEXT: s_sub_u32 s14, s13, s6 +; GCN-NEXT: s_subb_u32 s12, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s7 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s10, s15, s11 -; GCN-NEXT: s_add_u32 s11, s12, 1 +; GCN-NEXT: s_cmp_ge_u32 s14, s6 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s12, s7 +; GCN-NEXT: s_cselect_b32 s12, s14, s15 +; GCN-NEXT: s_add_u32 s14, s10, 1 ; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_add_u32 s16, s12, 2 +; GCN-NEXT: s_add_u32 s16, s10, 2 ; GCN-NEXT: s_addc_u32 s17, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_cselect_b32 s10, s16, s11 -; GCN-NEXT: s_cselect_b32 s11, s17, s15 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_cselect_b32 s12, s16, s14 +; GCN-NEXT: s_cselect_b32 s14, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, 0, s13 +; GCN-NEXT: s_subb_u32 s8, 0, s11 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s6 +; GCN-NEXT: s_cmp_ge_u32 s13, s6 ; GCN-NEXT: s_cselect_b32 s6, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s6, s6, s9 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s11, 0 -; GCN-NEXT: s_cselect_b32 s6, s10, s12 +; GCN-NEXT: s_cselect_b32 s7, s14, 0 +; GCN-NEXT: s_cselect_b32 s6, s12, s10 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1303,8 +1285,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 -; GCN-IR-NEXT: 
s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1335,8 +1315,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ea9bb0417dfa4..862e2dd2de051 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, 
s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 
s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 
s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: 
s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GCN-NEXT: s_sub_u32 s10, 0, s4 -; GCN-NEXT: s_subb_u32 s11, 0, s5 +; GCN-NEXT: s_sub_u32 s8, 0, s4 +; GCN-NEXT: s_subb_u32 s9, 0, s5 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_mul_i32 s9, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s8 -; GCN-NEXT: s_mul_i32 s14, s10, s8 -; GCN-NEXT: s_add_i32 s9, s15, s9 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s9, s9, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s8, s9 -; GCN-NEXT: s_add_u32 s13, s13, s15 -; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: v_mul_hi_u32 v0, v1, s9 -; 
GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: s_mul_i32 s14, s12, s14 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s8, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s9, s2 +; GCN-NEXT: s_mul_i32 s13, s8, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s14, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_add_u32 s9, s13, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s8, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s8, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s8, s9, s8 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s9, s10, s14 -; GCN-NEXT: s_add_i32 s8, s8, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s8 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: s_mul_i32 s13, s10, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 
s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s8, s10 +; GCN-NEXT: s_mul_i32 s9, s9, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s9, s12, s9 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s9, s11, s9 -; GCN-NEXT: s_addc_u32 s9, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s8, s12, s8 -; GCN-NEXT: s_add_u32 s8, s9, s8 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s8 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_addc_u32 s10, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s8, s8, s11 +; GCN-NEXT: s_add_i32 s9, s12, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s8, s10, s8 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s8, s13, s8 +; GCN-NEXT: s_addc_u32 s8, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s9, s10, s9 +; GCN-NEXT: s_add_u32 s8, s8, s9 +; GCN-NEXT: s_addc_u32 s9, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s8 +; GCN-NEXT: s_addc_u32 s10, s10, s9 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, 
i64 %x, i64 % ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_addc_u32 s11, 0, s12 ; GCN-NEXT: s_mul_i32 s11, s4, s11 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 @@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s13, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s12, s5 ; GCN-NEXT: s_sub_u32 s16, s6, s4 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s17, s12, s13 ; GCN-NEXT: s_subb_u32 s17, s15, 0 ; GCN-NEXT: s_cmp_ge_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, -1, 0 @@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_cmp_eq_u32 s17, s5 ; GCN-NEXT: s_cselect_b32 s18, s19, s18 ; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s15, s15, s5 -; GCN-NEXT: s_sub_u32 s19, s16, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s12, s15, 0 +; GCN-NEXT: s_subb_u32 s12, s15, s5 +; GCN-NEXT: s_sub_u32 s13, s16, s4 +; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_cselect_b32 s13, s19, s16 +; GCN-NEXT: s_cselect_b32 s13, s13, s16 ; GCN-NEXT: s_cselect_b32 s12, s12, s17 ; GCN-NEXT: s_or_b32 s10, s10, s11 ; GCN-NEXT: s_subb_u32 s7, s7, s14 @@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s10, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr 
addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_u32 s14, s14, s20 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s21 ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 -; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GCN-IR-NEXT: s_or_b32 s20, s20, s21 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-NEXT: s_sub_u32 s2, 0, s4 -; GCN-NEXT: s_subb_u32 s8, 0, s5 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_subb_u32 s6, 0, s5 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s7, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s6 -; GCN-NEXT: s_mul_i32 s11, s2, s6 -; GCN-NEXT: s_add_i32 s7, s12, s7 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s7, s7, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_mul_i32 s13, s6, s7 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s7 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: s_addc_u32 s11, s11, 0 -; GCN-NEXT: s_mul_i32 s7, s9, s7 -; GCN-NEXT: s_add_u32 s7, s10, s7 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 
s11, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_mul_i32 s8, s2, s7 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s6, s3 +; GCN-NEXT: s_mul_i32 s10, s2, s3 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_mul_i32 s12, s3, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s7, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_mul_i32 s8, s7, s8 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s3, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s6, s2, s9 -; GCN-NEXT: v_readfirstlane_b32 s7, v0 -; GCN-NEXT: s_add_i32 s6, s7, s6 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s2, s2, s11 -; GCN-NEXT: s_add_i32 s6, s6, s8 +; GCN-NEXT: s_addc_u32 s7, s7, s9 +; GCN-NEXT: s_mul_i32 s9, s2, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s2, s2, s8 +; GCN-NEXT: s_add_i32 s6, s9, s6 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s6 +; GCN-NEXT: v_mul_hi_u32 v3, s7, v2 +; GCN-NEXT: v_mul_hi_u32 v2, 
s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s6 ; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s2, s9, s2 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s7, v3 -; GCN-NEXT: s_add_u32 s2, s8, s2 -; GCN-NEXT: s_addc_u32 s2, s10, s7 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: s_addc_u32 s7, s7, 0 -; GCN-NEXT: s_mul_i32 s6, s9, s6 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s2, s7, s2 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s2, s10, s2 +; GCN-NEXT: s_addc_u32 s2, s11, s9 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s6, s7, s6 ; GCN-NEXT: s_add_u32 s2, s2, s6 -; GCN-NEXT: s_addc_u32 s8, 0, s7 -; GCN-NEXT: s_add_u32 s2, s11, s2 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_addc_u32 s6, s9, s8 +; GCN-NEXT: s_addc_u32 s6, 0, s9 +; GCN-NEXT: s_add_u32 s2, s8, s2 +; GCN-NEXT: s_addc_u32 s6, s7, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 ; GCN-NEXT: s_mul_i32 s6, s6, 24 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 ; GCN-NEXT: s_add_u32 s6, s8, s6 @@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: s_add_i32 s10, s8, s7 ; GCN-NEXT: s_sub_i32 s8, 0, s10 ; GCN-NEXT: s_sub_u32 s11, 24, s6 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s9, s6, s7 ; GCN-NEXT: s_subb_u32 s12, s8, s5 ; GCN-NEXT: s_sub_u32 s13, s11, s4 
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s5 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s13, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s5 +; GCN-NEXT: s_sub_u32 s9, s13, s4 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 ; GCN-NEXT: s_subb_u32 s6, 0, s10 @@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 -; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-IR-NEXT: s_or_b32 s9, s10, s11 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bdd22f25e91c8..b000fae124ede 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -15,10 +15,8 @@ define amdgpu_kernel void 
@s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_add_u32 s2, s2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: s_addc_u32 s3, s3, s9 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -433,8 +431,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s4, s4, s6 -; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; SI-NEXT: s_or_b32 s6, s12, s13 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index fd461ac80ea55..775483c040b7f 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -146,8 +146,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -179,8 +177,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -786,12 +782,11 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s6, 0, s2 -; GCN-NEXT: s_subb_u32 s8, 0, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -800,118 +795,112 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_mul_i32 s8, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s5, s7 +; GCN-NEXT: s_mul_i32 s10, s4, s7 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_mul_i32 s12, s7, s8 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s8 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s7, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s6, s6, s9 +; GCN-NEXT: s_mul_i32 s9, s4, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 
s4, s4, s8 +; GCN-NEXT: s_add_i32 s5, s9, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s4, s10, s4 +; GCN-NEXT: s_addc_u32 s4, s11, s9 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s5, s6, s5 +; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s9 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s6, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s4, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s5, 24 +; GCN-NEXT: s_mul_i32 s5, s5, 24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mul_i32 s5, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s4 -; GCN-NEXT: s_mul_i32 s11, s6, s4 -; GCN-NEXT: s_add_i32 s5, s12, s5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s5, s5, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 -; GCN-NEXT: s_mul_i32 s13, s4, s5 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: s_addc_u32 s11, s14, 0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_add_u32 s5, s10, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s4, s5 -; GCN-NEXT: 
v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: s_mul_i32 s4, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s5, s6, s11 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s4 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: s_add_u32 s5, s8, s5 -; GCN-NEXT: s_addc_u32 s5, s10, s6 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_addc_u32 s6, s6, 0 -; GCN-NEXT: s_mul_i32 s4, s9, s4 -; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: s_addc_u32 s6, 0, s6 -; GCN-NEXT: s_add_u32 s8, s11, s4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s4, s9, s6 -; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 -; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 -; GCN-NEXT: s_mul_i32 s4, s4, 24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s10, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s8, 0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s10 +; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s11, s1, s0 -; GCN-NEXT: s_sub_i32 s8, 0, s11 -; GCN-NEXT: s_mul_i32 s0, s2, s10 -; GCN-NEXT: s_sub_u32 s12, 24, s0 +; GCN-NEXT: 
s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_mul_i32 s0, s2, s8 +; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s9, s0, s1 -; GCN-NEXT: s_subb_u32 s13, s8, s3 -; GCN-NEXT: s_sub_u32 s14, s12, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s2 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s12, s11, s2 +; GCN-NEXT: s_subb_u32 s10, s10, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s3 ; GCN-NEXT: s_cselect_b32 s13, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s8, s13, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, s3 +; GCN-NEXT: s_cselect_b32 s10, s12, s13 +; GCN-NEXT: s_add_u32 s12, s8, 1 ; GCN-NEXT: s_addc_u32 s13, 0, 0 -; GCN-NEXT: s_add_u32 s14, s10, 2 +; GCN-NEXT: s_add_u32 s14, s8, 2 ; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s14, s9 -; GCN-NEXT: s_cselect_b32 s9, s15, s13 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s10, s14, s12 +; GCN-NEXT: s_cselect_b32 s12, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_subb_u32 s0, 0, s11 +; GCN-NEXT: s_subb_u32 s0, 0, s9 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s0, s2, s1 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s9, 0 -; GCN-NEXT: s_cselect_b32 s1, s8, s10 +; GCN-NEXT: s_cselect_b32 s0, s12, 0 +; GCN-NEXT: s_cselect_b32 s1, s10, s8 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -937,8 
+926,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -969,8 +956,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1307,8 +1292,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1336,8 +1319,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s2, s2, s8 ; GCN-IR-NEXT: s_subb_u32 s3, s3, 0 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-IR-NEXT: s_or_b32 s12, s12, s13 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 137dc1fe42294..28e6627b87413 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 
0xd ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s10, 0, s8 -; GCN-NEXT: s_subb_u32 s11, 0, s9 +; GCN-NEXT: s_sub_u32 s0, 0, s8 +; GCN-NEXT: s_subb_u32 s1, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 -; GCN-NEXT: v_readfirstlane_b32 s12, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_mul_i32 s1, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_mul_i32 s13, s11, s0 -; GCN-NEXT: s_mul_i32 s14, s10, s0 -; GCN-NEXT: s_add_i32 s1, s15, s1 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 -; GCN-NEXT: s_add_i32 s1, s1, s13 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 -; GCN-NEXT: v_readfirstlane_b32 s13, v3 -; GCN-NEXT: s_mul_i32 s15, s0, s1 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 -; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_mul_hi_u32 v2, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s11, s0, s10 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s1, s2 +; GCN-NEXT: s_mul_i32 s13, s0, s2 +; GCN-NEXT: s_add_i32 s11, s14, s11 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s11, s11, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s2, s11 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s11 +; GCN-NEXT: s_add_u32 s12, s12, s15 ; GCN-NEXT: v_readfirstlane_b32 s15, v0 -; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: 
s_mul_i32 s13, s10, s13 ; GCN-NEXT: s_addc_u32 s15, 0, s15 -; GCN-NEXT: v_readfirstlane_b32 s16, v4 -; GCN-NEXT: s_add_u32 s13, s13, s14 -; GCN-NEXT: s_addc_u32 s13, s15, s16 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_add_u32 s1, s13, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s14 -; GCN-NEXT: s_add_u32 s14, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s12, s12, s13 -; GCN-NEXT: s_mul_i32 s0, s10, s12 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s0, s1, s0 -; GCN-NEXT: s_mul_i32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s1, s10, s14 -; GCN-NEXT: s_add_i32 s0, s0, s11 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 -; GCN-NEXT: s_mul_i32 s11, s14, s0 -; GCN-NEXT: v_readfirstlane_b32 s15, v2 -; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s11, s10, s11 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s11, s2, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 +; GCN-NEXT: s_addc_u32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s12, s0, s10 +; GCN-NEXT: s_mul_i32 s1, s1, s11 ; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s1, s13, s10 -; GCN-NEXT: v_readfirstlane_b32 s10, v1 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: 
s_add_u32 s0, s1, s0 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s11, s14, s0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s0, s0, s11 +; GCN-NEXT: s_add_i32 s1, s12, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s1 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s13, s15, s13 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s0, s10, s0 +; GCN-NEXT: s_addc_u32 s14, 0, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s0, s13, s0 +; GCN-NEXT: s_addc_u32 s0, s14, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s1, s10, s1 +; GCN-NEXT: s_add_u32 s0, s0, s1 +; GCN-NEXT: s_addc_u32 s1, 0, s12 +; GCN-NEXT: s_add_u32 s11, s11, s0 +; GCN-NEXT: s_addc_u32 s1, s10, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s11 @@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 ; GCN-NEXT: s_subb_u32 s13, s10, s9 ; GCN-NEXT: s_sub_u32 s14, s6, s8 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 ; GCN-NEXT: s_subb_u32 s15, s13, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 @@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_cmp_eq_u32 s15, s9 ; GCN-NEXT: s_cselect_b32 s16, s17, s16 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, 
s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_subb_u32 s10, s13, s9 +; GCN-NEXT: s_sub_u32 s11, s14, s8 +; GCN-NEXT: s_subb_u32 s10, s10, 0 ; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 +; GCN-NEXT: s_cselect_b32 s11, s11, s14 ; GCN-NEXT: s_cselect_b32 s10, s10, s15 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_subb_u32 s4, s7, s12 @@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_cmp_lg_u32 s5, 0 ; GCN-NEXT: s_cselect_b32 s4, s10, s4 ; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-IR-NEXT: s_or_b32 s8, s8, s9 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_sub_u32 s12, s12, s18 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s19 ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 -; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GCN-IR-NEXT: s_or_b32 s18, s18, s19 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -803,12 +791,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; 
GCN-NEXT: s_sub_u32 s6, 0, s2 -; GCN-NEXT: s_subb_u32 s8, 0, s3 +; GCN-NEXT: s_sub_u32 s4, 0, s2 +; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -817,77 +804,73 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_mul_i32 s8, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: s_mul_i32 s9, s5, s7 +; GCN-NEXT: s_mul_i32 s10, s4, s7 +; GCN-NEXT: s_add_i32 s8, s11, s8 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 +; GCN-NEXT: s_add_i32 s8, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_mul_i32 s12, s7, s8 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s8 +; GCN-NEXT: s_add_u32 s9, s9, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s10, s6, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v4 +; GCN-NEXT: s_add_u32 s9, s9, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s9, s12, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s8 +; GCN-NEXT: s_addc_u32 s10, s13, 0 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s9, 0, s10 +; GCN-NEXT: s_add_u32 s8, s7, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s6, s6, s9 +; GCN-NEXT: s_mul_i32 s9, s4, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s9, s10, s9 +; GCN-NEXT: s_mul_i32 s4, s4, s8 +; GCN-NEXT: s_add_i32 s5, s9, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, 
s8, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mul_i32 s10, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s10, s12, s10 +; GCN-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-NEXT: s_mul_i32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s4, s10, s4 +; GCN-NEXT: s_addc_u32 s4, s11, s9 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s5, s6, s5 +; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s9 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s6, s5 +; GCN-NEXT: v_mul_hi_u32 v1, s4, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s5, 24 +; GCN-NEXT: s_mul_i32 s5, s5, 24 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mul_i32 s5, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_mul_i32 s10, s8, s4 -; GCN-NEXT: s_mul_i32 s11, s6, s4 -; GCN-NEXT: s_add_i32 s5, s12, s5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 -; GCN-NEXT: s_add_i32 s5, s5, s10 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 -; GCN-NEXT: v_readfirstlane_b32 s10, v3 -; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 -; GCN-NEXT: s_mul_i32 s13, s4, s5 -; GCN-NEXT: s_add_u32 s10, s10, s13 -; GCN-NEXT: v_readfirstlane_b32 s13, v0 -; GCN-NEXT: s_mul_i32 s11, s9, s11 -; GCN-NEXT: s_addc_u32 s13, 0, s13 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NEXT: s_add_u32 s10, s10, s11 -; GCN-NEXT: v_readfirstlane_b32 s14, v1 -; GCN-NEXT: s_addc_u32 s10, s13, s12 -; GCN-NEXT: s_addc_u32 s11, s14, 0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_add_u32 s5, s10, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s11 -; GCN-NEXT: s_add_u32 s11, s4, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s9, s9, s10 -; GCN-NEXT: 
s_mul_i32 s4, s6, s9 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s8, s8, s11 -; GCN-NEXT: s_mul_i32 s5, s6, s11 -; GCN-NEXT: s_add_i32 s4, s4, s8 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 -; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: s_mul_i32 s8, s11, s4 -; GCN-NEXT: v_readfirstlane_b32 s12, v2 -; GCN-NEXT: s_add_u32 s8, s12, s8 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NEXT: s_mul_i32 s5, s9, s5 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: s_add_u32 s5, s8, s5 -; GCN-NEXT: s_addc_u32 s5, s10, s6 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: s_addc_u32 s6, s6, 0 -; GCN-NEXT: s_mul_i32 s4, s9, s4 -; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: s_addc_u32 s6, 0, s6 -; GCN-NEXT: s_add_u32 s8, s11, s4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_addc_u32 s4, s9, s6 -; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 -; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 -; GCN-NEXT: s_mul_i32 s4, s4, 24 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s8, 0, s5 +; GCN-NEXT: s_addc_u32 s8, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 @@ -899,11 +882,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s0, s2, s8 ; GCN-NEXT: s_sub_u32 s11, 24, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s8, s0, s1 ; GCN-NEXT: s_subb_u32 s12, s9, s3 ; GCN-NEXT: s_sub_u32 s13, s11, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 ; GCN-NEXT: s_subb_u32 s14, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s14, s3 ; GCN-NEXT: s_cselect_b32 s15, -1, 0 @@ -912,13 +893,11 @@ 
define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_eq_u32 s14, s3 ; GCN-NEXT: s_cselect_b32 s15, s16, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s3 -; GCN-NEXT: s_sub_u32 s16, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_subb_u32 s8, s12, s3 +; GCN-NEXT: s_sub_u32 s9, s13, s2 +; GCN-NEXT: s_subb_u32 s8, s8, 0 ; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 +; GCN-NEXT: s_cselect_b32 s9, s9, s13 ; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_subb_u32 s0, 0, s10 @@ -931,6 +910,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_cmp_lg_u32 s1, 0 ; GCN-NEXT: s_cselect_b32 s0, s8, s0 ; GCN-NEXT: s_cselect_b32 s1, s9, s11 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -956,8 +936,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -988,8 +966,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s10, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s17 ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 -; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GCN-IR-NEXT: s_or_b32 s16, s16, s17 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1077,8 +1053,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: 
s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 -; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-IR-NEXT: s_or_b32 s6, s6, s7 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1106,8 +1080,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s8, s8, s10 ; GCN-IR-NEXT: s_subb_u32 s9, s9, 0 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-IR-NEXT: s_or_b32 s14, s14, s15 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index e8db6471b6a46..8a54ad301f48a 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -15,10 +15,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_sub_u32 s2, s2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: s_subb_u32 s3, s3, s9 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -432,8 +430,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_u32 s4, s4, s6 -; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 -; SI-NEXT: s_or_b32 s6, s12, s13 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 From 8f79d82850d3a423649673beee024e7ce84e98ae Mon Sep 17 00:00:00 2001 From: John Lu Date: Mon, 27 Oct 2025 11:09:27 -0500 Subject: [PATCH 02/15] Fix mnemonic Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 305c9c40ab726..91df365072521 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10689,7 +10689,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!optimizeSCC(Def, &CmpInstr, RI)) return false; - // If s_or_32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of + // If s_or_b32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of // a register pair) and the input is a 64-bit foldableSelect then transform: // // (s_or_b32 (S_CSELECT_B64 (non-zero imm), 0), 0 => (S_CSELECT_B64 From a276310d79c6b6ac5cf8ad096c6ddd9ca3fe2276 Mon Sep 17 00:00:00 2001 From: John Lu Date: Fri, 31 Oct 2025 23:20:59 -0500 Subject: [PATCH 03/15] Fix arguments Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 91df365072521..e37dea66ae031 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10704,8 +10704,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, OrOpnd1.getReg() != OrOpnd2.getReg()) { auto *Def1 = getVRegSubRegDef(getRegSubRegPair(OrOpnd1), *MRI); auto *Def2 = getVRegSubRegDef(getRegSubRegPair(OrOpnd2), *MRI); - if (Def1 == Def2 && foldableSelect(Def1)) - optimizeSCC(Def1, Def); + if (Def1 == Def2 && foldableSelect(*Def1)) + optimizeSCC(Def1, Def, RI); } } return true; From f261b26a4d99c26e4a7e737e4add771b65d7e931 Mon Sep 17 00:00:00 2001 From: John Lu Date: Fri, 31 Oct 2025 23:29:00 -0500 Subject: [PATCH 04/15] Fix comment Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e37dea66ae031..db6c58cbaabed 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10690,11 +10690,11 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; // If s_or_b32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of - // a register pair) and the input is a 64-bit foldableSelect then transform: + // a register pair) and the inputs are the hi and lo-halves of a 64-bit + // foldableSelect then transform: // - // (s_or_b32 (S_CSELECT_B64 (non-zero imm), 0), 0 => (S_CSELECT_B64 - // (non-zero - // imm), 0) + // (s_or_b32 [hi and lo (S_CSELECT_B64 (non-zero imm), 0)]) => + // (S_CSELECT_B64 (non-zero imm), 0) if (Def->getOpcode() == AMDGPU::S_OR_B32 && MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { MachineOperand OrOpnd1 = Def->getOperand(1); From b4f3449a2da8daee663e7f57cc1fc9aea95f2444 Mon Sep 17 00:00:00 2001 From: John Lu Date: Sat, 1 Nov 2025 13:01:56 -0500 Subject: [PATCH 05/15] Don't copy MachineOperand Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index db6c58cbaabed..27281b1ea1355 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10697,8 +10697,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // (S_CSELECT_B64 (non-zero imm), 0) if (Def->getOpcode() == AMDGPU::S_OR_B32 && MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { - MachineOperand OrOpnd1 = Def->getOperand(1); - MachineOperand OrOpnd2 = Def->getOperand(2); + const MachineOperand &OrOpnd1 = Def->getOperand(1); + const MachineOperand &OrOpnd2 = Def->getOperand(2); if (OrOpnd1.isReg() && OrOpnd2.isReg() && OrOpnd1.getReg() != OrOpnd2.getReg()) { From b3db724b5bbcea25ecaaea5cd3bb533d2907392f Mon 
Sep 17 00:00:00 2001 From: John Lu Date: Sat, 1 Nov 2025 13:08:01 -0500 Subject: [PATCH 06/15] Make comment clear Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 27281b1ea1355..5b420533f6804 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10689,12 +10689,13 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!optimizeSCC(Def, &CmpInstr, RI)) return false; - // If s_or_b32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of - // a register pair) and the inputs are the hi and lo-halves of a 64-bit - // foldableSelect then transform: - // - // (s_or_b32 [hi and lo (S_CSELECT_B64 (non-zero imm), 0)]) => - // (S_CSELECT_B64 (non-zero imm), 0) + // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit + // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a + // 64-bit foldableSelect then transform: + // s_cselect_b64 sX, (non-zero imm), 0 + // s_or_b32 sY, hi(sX), lo(sX) + // to: + // s_cselect_b64 sX, (non-zero imm), 0 if (Def->getOpcode() == AMDGPU::S_OR_B32 && MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { const MachineOperand &OrOpnd1 = Def->getOperand(1); From 57d9eddbb9e0b595dbd57973b5c02d027a188581 Mon Sep 17 00:00:00 2001 From: John Lu Date: Mon, 3 Nov 2025 10:43:26 -0600 Subject: [PATCH 07/15] Ensure hi/lo halves are or-ed. Add mir tests. 
Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 28 +++--- llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 90 +++++++++++++++++++ 2 files changed, 107 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5b420533f6804..db00619b5cfeb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10691,22 +10691,28 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a - // 64-bit foldableSelect then transform: - // s_cselect_b64 sX, (non-zero imm), 0 - // s_or_b32 sY, hi(sX), lo(sX) - // to: - // s_cselect_b64 sX, (non-zero imm), 0 + // 64-bit foldableSelect then delete s_or_b32 in the sequence: + // sX = s_cselect_b64 (non-zero imm), 0 + // sLo = copy sX.sub0 + // sHi = copy sX.sub1 + // sY = s_or_b32 sLo, sHi if (Def->getOpcode() == AMDGPU::S_OR_B32 && MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { const MachineOperand &OrOpnd1 = Def->getOperand(1); const MachineOperand &OrOpnd2 = Def->getOperand(2); - - if (OrOpnd1.isReg() && OrOpnd2.isReg() && - OrOpnd1.getReg() != OrOpnd2.getReg()) { - auto *Def1 = getVRegSubRegDef(getRegSubRegPair(OrOpnd1), *MRI); - auto *Def2 = getVRegSubRegDef(getRegSubRegPair(OrOpnd2), *MRI); - if (Def1 == Def2 && foldableSelect(*Def1)) + if (OrOpnd1.isReg() && OrOpnd2.isReg()) { + MachineInstr *Def1 = MRI->getUniqueVRegDef(OrOpnd1.getReg()); + MachineInstr *Def2 = MRI->getUniqueVRegDef(OrOpnd2.getReg()); + if (Def1->getOpcode() == AMDGPU::COPY && + Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && + Def2->getOperand(1).isReg() && + Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && + Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && + Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg() && + 
foldableSelect( + *MRI->getUniqueVRegDef(Def1->getOperand(1).getReg()))) { optimizeSCC(Def1, Def, RI); + } } } return true; diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index fba42c494343b..7538fab3f6069 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -2277,3 +2277,93 @@ body: | S_ENDPGM 0 ... + +--- +name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000 +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec + %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... 
+ +--- +# Do not delete s_or_b32 since both operands are sub1. +name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc + %40:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %41:sreg_32 = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... 
From 919962bc0dca3e4142f4a7fdcc8a551d61878e7d Mon Sep 17 00:00:00 2001 From: John Lu Date: Tue, 4 Nov 2025 23:39:44 -0600 Subject: [PATCH 08/15] Add undef testcase Signed-off-by: John Lu --- llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index 7538fab3f6069..0fe665551b9b6 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -2367,3 +2367,44 @@ body: | S_ENDPGM 0 ... + +--- +name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000 +body: | + ; GCN-LABEL: name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 undef %4:sreg_32_xm0_xexec, undef %5:sreg_32_xm0_xexec, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + %sgpr4:sreg_32 = S_OR_B32 undef %40:sreg_32_xm0_xexec, undef %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + 
S_ENDPGM 0 + +... From 7dbdbe42a6505e51cc380c16ba4928fd24bcd6ca Mon Sep 17 00:00:00 2001 From: John Lu Date: Tue, 4 Nov 2025 23:42:18 -0600 Subject: [PATCH 09/15] Fix handling of undef Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index db00619b5cfeb..0c105e631f669 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10703,15 +10703,17 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (OrOpnd1.isReg() && OrOpnd2.isReg()) { MachineInstr *Def1 = MRI->getUniqueVRegDef(OrOpnd1.getReg()); MachineInstr *Def2 = MRI->getUniqueVRegDef(OrOpnd2.getReg()); - if (Def1->getOpcode() == AMDGPU::COPY && - Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && + if (Def1 && Def1->getOpcode() == AMDGPU::COPY && + Def2 && Def2->getOpcode() == AMDGPU::COPY && + Def1->getOperand(1).isReg() && Def2->getOperand(1).isReg() && Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && - Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg() && - foldableSelect( - *MRI->getUniqueVRegDef(Def1->getOperand(1).getReg()))) { - optimizeSCC(Def1, Def, RI); + Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { + MachineInstr *Select = MRI->getUniqueVRegDef(Def1->getOperand(1).getReg()); + if (Select && foldableSelect(*Select)) { + optimizeSCC(Def1, Def, RI); + } } } } From 92f73c632a12ada133f085b363df1525b6443b4a Mon Sep 17 00:00:00 2001 From: John Lu Date: Tue, 4 Nov 2025 23:55:54 -0600 Subject: [PATCH 10/15] Use getVRegDef Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 
0c105e631f669..d03a916344ce5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10669,7 +10669,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (CmpValue != 0) return false; - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); + MachineInstr *Def = MRI->getVRegDef(SrcReg); if (!Def || Def->getParent() != CmpInstr.getParent()) return false; @@ -10701,19 +10701,18 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, const MachineOperand &OrOpnd1 = Def->getOperand(1); const MachineOperand &OrOpnd2 = Def->getOperand(2); if (OrOpnd1.isReg() && OrOpnd2.isReg()) { - MachineInstr *Def1 = MRI->getUniqueVRegDef(OrOpnd1.getReg()); - MachineInstr *Def2 = MRI->getUniqueVRegDef(OrOpnd2.getReg()); - if (Def1 && Def1->getOpcode() == AMDGPU::COPY && - Def2 && Def2->getOpcode() == AMDGPU::COPY && - Def1->getOperand(1).isReg() && + MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg()); + MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg()); + if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 && + Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && Def2->getOperand(1).isReg() && Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { - MachineInstr *Select = MRI->getUniqueVRegDef(Def1->getOperand(1).getReg()); - if (Select && foldableSelect(*Select)) { - optimizeSCC(Def1, Def, RI); - } + MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); + if (Select && foldableSelect(*Select)) { + optimizeSCC(Def1, Def, RI); + } } } } @@ -10746,7 +10745,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); 
+ MachineInstr *Def = MRI->getVRegDef(SrcReg); if (!Def || Def->getParent() != CmpInstr.getParent()) return false; From 3f8d2092d54c1dd94f6f19f7318ee9d136df0d5f Mon Sep 17 00:00:00 2001 From: John Lu Date: Wed, 5 Nov 2025 07:55:29 -0600 Subject: [PATCH 11/15] Use correct instruction for scan start point Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d03a916344ce5..789b4ff17fd76 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10711,7 +10711,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); if (Select && foldableSelect(*Select)) { - optimizeSCC(Def1, Def, RI); + optimizeSCC(Select, Def, RI); } } } From d815e12654cff8f1f5fafa9d41be0f1e9be1fe3c Mon Sep 17 00:00:00 2001 From: John Lu Date: Wed, 5 Nov 2025 08:17:17 -0600 Subject: [PATCH 12/15] Ensure scan points have same parent Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 789b4ff17fd76..529ae48c3b213 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10625,6 +10625,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, const SIRegisterInfo &RI) { MachineInstr *KillsSCC = nullptr; + if (SCCValid->getParent() != SCCRedefine->getParent()) + return false; for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), SCCRedefine->getIterator())) { if (MI.modifiesRegister(AMDGPU::SCC, &RI)) @@ -10670,7 
+10672,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; MachineInstr *Def = MRI->getVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + if (!Def) return false; // For S_OP that set SCC = DST!=0, do the transformation @@ -10746,7 +10748,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n MachineInstr *Def = MRI->getVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + if (!Def) return false; if (Def->getOpcode() != AMDGPU::S_AND_B32 && From 8889e21f4ad2f4c5e96b7d7a9ef9ebf531af0530 Mon Sep 17 00:00:00 2001 From: John Lu Date: Wed, 5 Nov 2025 08:26:03 -0600 Subject: [PATCH 13/15] Add negative test for intervening scc def Signed-off-by: John Lu --- llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index 0fe665551b9b6..5b71482439fb7 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -2320,6 +2320,53 @@ body: | bb.2: S_ENDPGM 0 +... 
+--- +# Do not delete s_or_b32 since because of intervening def of scc +name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening +body: | + ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]] + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1 + ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + %0:vgpr_32 = IMPLICIT_DEF + %2:sreg_32 = COPY %0 + S_CMP_LG_U32 %2, 0, implicit-def $scc + %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + S_CMP_LG_U32 %2, 0, implicit-def $scc + %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec + %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec + %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc + S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + ... 
--- From f2e16ad4e8236bc35d69a7432c2009585a09e5c0 Mon Sep 17 00:00:00 2001 From: John Lu Date: Wed, 5 Nov 2025 08:37:31 -0600 Subject: [PATCH 14/15] Fix grammar Signed-off-by: John Lu --- llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index 5b71482439fb7..fa452f3717f0e 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -2322,7 +2322,7 @@ body: | ... --- -# Do not delete s_or_b32 since because of intervening def of scc +# Do not delete s_or_b32 because of intervening def of scc name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening body: | ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening From 9cea63d7b8e3cf8e7ba8abe5aa848202ffa9b63d Mon Sep 17 00:00:00 2001 From: John Lu Date: Wed, 5 Nov 2025 10:52:09 -0600 Subject: [PATCH 15/15] Remove braces Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 529ae48c3b213..1920be1df6127 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10712,9 +10712,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); - if (Select && foldableSelect(*Select)) { + if (Select && foldableSelect(*Select)) optimizeSCC(Select, Def, RI); - }