diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..def8ba47c9c37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1301,6 +1301,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
     if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
       Ops.push_back(&Op);
+
+    // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
+    // will be optimized away, and sinking them can help SDAG combines.
+    DataLayout DL = I->getModule()->getDataLayout();
+    auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
+                                           unsigned VecIndex) {
+      unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
+      return EltSize >= 32 ||
+             (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
+    };
+
+    uint64_t VecIndex;
+    Value *Vec;
+    if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
+      Instruction *VecOpInst =
+          dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
+      // If a zero cost extractelement instruction is the only use of the
+      // vector, then it may be combined with the def.
+      if (VecOpInst && VecOpInst->hasOneUse())
+        continue;
+
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (match(Op.get(),
+              m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+      if (Shuffle->isIdentity()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      unsigned EltSize = DL.getTypeSizeInBits(
+          cast<FixedVectorType>(cast<VectorType>(Shuffle->getType()))
+              ->getElementType());
+
+      // For i32 (or greater) shufflevectors, these will be lowered into a
+      // series of insert / extract elements, which will be coalesced away.
+      if (EltSize >= 32) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (EltSize < 16 || !ST->has16BitInsts())
+        continue;
+
+      int NumSubElts, SubIndex;
+      if (Shuffle->changesLength()) {
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+          Ops.push_back(&Op);
+          continue;
+        }
+
+        if (Shuffle->isExtractSubvectorMask(SubIndex) ||
+            Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+          if (!(SubIndex % 2)) {
+            Ops.push_back(&Op);
+            continue;
+          }
+        }
+      }
+
+      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+          Shuffle->isSingleSource()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+        if (!(SubIndex % 2)) {
+          Ops.push_back(&Op);
+          continue;
+        }
+      }
+    }
   }
 
   return !Ops.empty();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 302b2395642d0..74b31913abb7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -2149,11 +2149,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT: s_cbranch_vccz .LBB11_2
 ; CI-NEXT: ; %bb.1: ; %frem.else
 ; CI-NEXT: s_and_b32 s6, s2, 0x80000000
-; CI-NEXT: v_mov_b32_e32 v1, s4
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; CI-NEXT: v_mov_b32_e32 v1, s6
-; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT: v_mov_b32_e32 v0, s6
+; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CI-NEXT: s_mov_b32 s6, 0
 ; CI-NEXT: .LBB11_2: ; %Flow53
 ; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2224,11 +2224,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT: s_cbranch_vccz .LBB11_10
 ; CI-NEXT: ; %bb.9: ; %frem.else16
 ; CI-NEXT: s_and_b32 s6, s3, 0x80000000
-; CI-NEXT: v_mov_b32_e32 v2, s5
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; CI-NEXT: v_mov_b32_e32 v2, s6
-; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s6
+; CI-NEXT: v_mov_b32_e32 v2, s3
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT: s_mov_b32 s6, 0
 ; CI-NEXT: .LBB11_10: ; %Flow49
 ; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2322,11 +2322,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT: s_cbranch_vccz .LBB11_2
 ; VI-NEXT: ; %bb.1: ; %frem.else
 ; VI-NEXT: s_and_b32 s6, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-NEXT: s_mov_b32 s6, 0
 ; VI-NEXT: .LBB11_2: ; %Flow53
 ; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2397,11 +2397,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT: s_cbranch_vccz .LBB11_10
 ; VI-NEXT: ; %bb.9: ; %frem.else16
 ; VI-NEXT: s_and_b32 s6, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
+; VI-NEXT:
v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: s_mov_b32 s6, 0 ; VI-NEXT: .LBB11_10: ; %Flow49 ; VI-NEXT: s_xor_b32 s6, s6, 1 @@ -2503,11 +2503,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccz .LBB12_2 ; CI-NEXT: ; %bb.1: ; %frem.else ; CI-NEXT: s_and_b32 s2, s4, 0x80000000 -; CI-NEXT: v_mov_b32_e32 v1, s8 -; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1| -; CI-NEXT: v_mov_b32_e32 v1, s2 -; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0| +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: .LBB12_2: ; %Flow127 ; CI-NEXT: s_xor_b32 s2, s2, 1 @@ -2578,11 +2578,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccz .LBB12_10 ; CI-NEXT: ; %bb.9: ; %frem.else16 ; CI-NEXT: s_and_b32 s2, s5, 0x80000000 -; CI-NEXT: v_mov_b32_e32 v2, s9 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2| -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1| +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v2, s5 +; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: .LBB12_10: ; %Flow123 ; CI-NEXT: s_xor_b32 s2, s2, 1 @@ -2653,11 +2653,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccz .LBB12_18 ; CI-NEXT: ; %bb.17: ; %frem.else47 ; CI-NEXT: s_and_b32 s2, s6, 0x80000000 -; CI-NEXT: v_mov_b32_e32 v3, s10 -; CI-NEXT: v_mov_b32_e32 v2, s6 -; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3| -; CI-NEXT: v_mov_b32_e32 v3, s2 -; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CI-NEXT: v_mov_b32_e32 v2, s10 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2| +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s6 +; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: .LBB12_18: ; %Flow119 ; CI-NEXT: s_xor_b32 s2, s2, 1 @@ -2728,11 +2728,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccz .LBB12_26 ; CI-NEXT: ; %bb.25: ; %frem.else78 ; CI-NEXT: s_and_b32 s2, s7, 0x80000000 -; CI-NEXT: v_mov_b32_e32 v4, s11 -; CI-NEXT: v_mov_b32_e32 v3, s7 -; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4| -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CI-NEXT: v_mov_b32_e32 v3, s11 +; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3| +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: v_mov_b32_e32 v4, s7 +; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: .LBB12_26: ; %Flow115 ; CI-NEXT: s_xor_b32 s2, s2, 1 @@ -2834,11 +2834,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %frem.else ; VI-NEXT: s_and_b32 s2, s4, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1| -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0| +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-NEXT: s_mov_b32 
s2, 0 ; VI-NEXT: .LBB12_2: ; %Flow127 ; VI-NEXT: s_xor_b32 s2, s2, 1 @@ -2909,11 +2909,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccz .LBB12_10 ; VI-NEXT: ; %bb.9: ; %frem.else16 ; VI-NEXT: s_and_b32 s2, s5, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v2, s9 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2| -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1| +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: .LBB12_10: ; %Flow123 ; VI-NEXT: s_xor_b32 s2, s2, 1 @@ -2984,11 +2984,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccz .LBB12_18 ; VI-NEXT: ; %bb.17: ; %frem.else47 ; VI-NEXT: s_and_b32 s2, s6, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v3, s10 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3| -; VI-NEXT: v_mov_b32_e32 v3, s2 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2| +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: .LBB12_18: ; %Flow119 ; VI-NEXT: s_xor_b32 s2, s2, 1 @@ -3059,11 +3059,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccz .LBB12_26 ; VI-NEXT: ; %bb.25: ; %frem.else78 ; VI-NEXT: s_and_b32 s2, s7, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v4, s11 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4| -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3| +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: .LBB12_26: ; %Flow115 ; VI-NEXT: s_xor_b32 s2, s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 78a961ea0da17..d75d2597685d6 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -5783,11 +5783,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v5, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_3 ; GFX11-TRUE16-NEXT: s_branch .LBB9_8 @@ -6221,12 +6221,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; 
GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s7 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB9_3 ; GFX1150-TRUE16-NEXT: s_branch .LBB9_8 @@ -6691,12 +6691,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s7 ; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB9_3 ; GFX1200-TRUE16-NEXT: s_branch .LBB9_8 @@ -8964,11 +8964,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v7, v4 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_3 ; GFX11-TRUE16-NEXT: s_branch .LBB10_8 @@ -9805,12 +9805,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_3 ; GFX1150-TRUE16-NEXT: s_branch .LBB10_8 @@ -10713,12 +10713,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; 
GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_3 ; GFX1200-TRUE16-NEXT: s_branch .LBB10_8 @@ -12714,18 +12714,18 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1150-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v2, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_readfirstlane_b32 s5, v1 -; GFX1150-NEXT: global_load_b64 v[1:2], v2, s[6:7] offset:32 ; GFX1150-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1150-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1150-NEXT: global_load_b64 v[0:1], v2, s[8:9] offset:32 ; GFX1150-NEXT: s_and_b32 s3, s6, 0x7fffffff ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1150-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1150-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1150-NEXT: s_and_b32 s8, s4, 0x7fffffff ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s3, s8 @@ -12933,232 +12933,221 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-LABEL: frem_v2f32: ; GFX1200: ; %bb.0: ; GFX1200-NEXT: s_clause 0x1 -; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX1200-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_readfirstlane_b32 s5, v1 -; GFX1200-NEXT: global_load_b64 v[1:2], v2, s[6:7] offset:32 -; GFX1200-NEXT: v_readfirstlane_b32 s6, v0 -; GFX1200-NEXT: s_and_b32 s3, s6, 0x7fffffff +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: global_load_b64 v[2:3], v0, s[10:11] +; GFX1200-NEXT: global_load_b64 v[0:1], v0, s[0:1] offset:32 +; GFX1200-NEXT: s_wait_loadcnt 0x1 +; GFX1200-NEXT: v_and_b32_e32 v4, 0x7fffffff, v2 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1200-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1200-NEXT: s_and_b32 s8, s4, 0x7fffffff -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1200-NEXT: s_cmp_ngt_f32 s3, s8 -; GFX1200-NEXT: s_cbranch_scc0 .LBB11_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else -; GFX1200-NEXT: s_cmp_eq_f32 s3, s8 -; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6 -; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo +; GFX1200-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v5 +; GFX1200-NEXT: s_cbranch_vccz .LBB11_2 +; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: v_bfi_b32 v6, 0x7fffffff, 0, v2 +; GFX1200-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo ; GFX1200-NEXT: s_cbranch_execz .LBB11_3 ; GFX1200-NEXT: s_branch .LBB11_8 ; GFX1200-NEXT: .LBB11_2: -; GFX1200-NEXT: ; implicit-def: $vgpr0 +; 
GFX1200-NEXT: ; implicit-def: $vgpr5 ; GFX1200-NEXT: .LBB11_3: ; %frem.compute -; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s4| -; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s6| -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v6, |v0| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v5, |v2| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v8, v2 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_ldexp_f32 v1, v1, 1 -; GFX1200-NEXT: v_ldexp_f32 v2, v0, 12 -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v0, s4 +; GFX1200-NEXT: v_ldexp_f32 v6, v6, 1 +; GFX1200-NEXT: v_ldexp_f32 v7, v5, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v5, v0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-NEXT: v_readfirstlane_b32 s7, v3 -; GFX1200-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 +; GFX1200-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1200-NEXT: v_div_scale_f32 v10, null, v6, v6, 1.0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1200-NEXT: v_add_nc_u32_e32 v0, -1, v0 -; GFX1200-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1200-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1200-NEXT: v_add_nc_u32_e32 v5, -1, v5 +; GFX1200-NEXT: v_rcp_f32_e32 v11, v10 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_not_b32_e32 v4, v0 -; GFX1200-NEXT: v_add_nc_u32_e32 v4, v4, v3 -; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX1200-NEXT: v_not_b32_e32 v9, v5 +; GFX1200-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX1200-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0 ; GFX1200-NEXT: s_denorm_mode 15 ; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1200-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v11 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX1200-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1200-NEXT: v_mul_f32_e32 v12, v8, v11 +; GFX1200-NEXT: v_fma_f32 v13, -v10, v12, v8 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v6 -; GFX1200-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX1200-NEXT: v_fma_f32 v8, -v10, v12, v8 ; GFX1200-NEXT: s_denorm_mode 12 ; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v6, v7 -; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 -; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 +; GFX1200-NEXT: v_div_fmas_f32 v8, v8, v11, v12 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v9 +; GFX1200-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB11_7 ; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader -; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_add_co_i32 s7, s7, 12 +; GFX1200-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_add_co_i32 s0, s0, 12 ; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_mov_b32_e32 v5, v2 -; GFX1200-NEXT: s_wait_alu 0xfffe -; 
GFX1200-NEXT: s_add_co_i32 s7, s7, -12 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_gt_i32 s7, 12 -; GFX1200-NEXT: v_mul_f32_e32 v2, v5, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: v_mov_b32_e32 v10, v7 +; GFX1200-NEXT: s_add_co_i32 s0, s0, -12 +; GFX1200-NEXT: s_cmp_gt_i32 s0, 12 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_rndne_f32_e32 v2, v2 -; GFX1200-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX1200-NEXT: v_mul_f32_e32 v7, v10, v8 +; GFX1200-NEXT: v_rndne_f32_e32 v7, v7 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v2, v2, v1, v5 -; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 -; GFX1200-NEXT: v_add_f32_e32 v4, v2, v1 +; GFX1200-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 +; GFX1200-NEXT: v_fma_f32 v7, v7, v6, v10 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX1200-NEXT: v_add_f32_e32 v9, v7, v6 ; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX1200-NEXT: v_ldexp_f32 v2, v2, 12 +; GFX1200-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v7, v7, 12 ; GFX1200-NEXT: s_cbranch_scc1 .LBB11_5 ; GFX1200-NEXT: ; %bb.6: ; %Flow51 -; GFX1200-NEXT: v_mov_b32_e32 v4, s7 -; GFX1200-NEXT: v_mov_b32_e32 v2, v5 +; GFX1200-NEXT: v_mov_b32_e32 v9, s0 +; GFX1200-NEXT: v_mov_b32_e32 v7, v10 ; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4 -; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX1200-NEXT: v_add_nc_u32_e32 v9, -11, v9 +; GFX1200-NEXT: v_ldexp_f32 v7, v7, v9 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v3, v2, v3 -; GFX1200-NEXT: v_rndne_f32_e32 v3, v3 +; GFX1200-NEXT: v_mul_f32_e32 v8, v7, v8 +; GFX1200-NEXT: v_rndne_f32_e32 v8, v8 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX1200-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX1200-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v6 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 -; GFX1200-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v7 +; GFX1200-NEXT: v_add_f32_e32 v6, v7, v6 ; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f32 v0, v1, v0 -; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s6 +; GFX1200-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX1200-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v2 ; GFX1200-NEXT: .LBB11_8: -; GFX1200-NEXT: s_and_b32 s6, s5, 0x7fffffff -; GFX1200-NEXT: s_and_b32 s8, s2, 0x7fffffff -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_ngt_f32 s6, s8 -; GFX1200-NEXT: s_cbranch_scc0 .LBB11_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else16 -; GFX1200-NEXT: s_cmp_eq_f32 s6, s8 -; GFX1200-NEXT: v_bfi_b32 v1, 
0x7fffffff, 0, s5 -; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_and_b32_e32 v2, 0x7fffffff, v3 +; GFX1200-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo +; GFX1200-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v2, v6 +; GFX1200-NEXT: s_cbranch_vccz .LBB11_10 +; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: v_bfi_b32 v7, 0x7fffffff, 0, v3 +; GFX1200-NEXT: v_cmp_eq_f32_e32 vcc_lo, v2, v6 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-NEXT: v_cndmask_b32_e32 v6, v3, v7, vcc_lo ; GFX1200-NEXT: s_cbranch_execz .LBB11_11 ; GFX1200-NEXT: s_branch .LBB11_16 ; GFX1200-NEXT: .LBB11_10: -; GFX1200-NEXT: ; implicit-def: $vgpr1 +; GFX1200-NEXT: ; implicit-def: $vgpr6 ; GFX1200-NEXT: .LBB11_11: ; %frem.compute15 -; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s2| -; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s5| -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s5 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v7, |v1| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v6, |v3| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v9, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_ldexp_f32 v2, v2, 1 -; GFX1200-NEXT: v_ldexp_f32 v3, v1, 12 -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v1, s2 +; GFX1200-NEXT: v_ldexp_f32 v7, v7, 1 +; GFX1200-NEXT: v_ldexp_f32 v8, v6, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 -; GFX1200-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 +; GFX1200-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1200-NEXT: v_div_scale_f32 v11, null, v7, v7, 1.0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1200-NEXT: v_add_nc_u32_e32 v1, -1, v1 -; GFX1200-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1200-NEXT: v_readfirstlane_b32 s1, v6 +; GFX1200-NEXT: v_add_nc_u32_e32 v6, -1, v6 +; GFX1200-NEXT: v_rcp_f32_e32 v12, v11 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_not_b32_e32 v5, v1 -; GFX1200-NEXT: v_add_nc_u32_e32 v5, v5, v4 -; GFX1200-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1200-NEXT: v_not_b32_e32 v10, v6 +; GFX1200-NEXT: v_add_nc_u32_e32 v10, v10, v9 +; GFX1200-NEXT: v_div_scale_f32 v9, vcc_lo, 1.0, v7, 1.0 ; GFX1200-NEXT: s_denorm_mode 15 ; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1200-NEXT: v_fma_f32 v13, -v11, v12, 1.0 +; GFX1200-NEXT: v_fmac_f32_e32 v12, v13, v12 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v8, v4, v7 -; GFX1200-NEXT: v_fma_f32 v9, -v6, v8, v4 +; GFX1200-NEXT: v_mul_f32_e32 v13, v9, v12 +; GFX1200-NEXT: v_fma_f32 v14, -v11, v13, v9 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v7 -; GFX1200-NEXT: v_fma_f32 v4, -v6, v8, v4 +; GFX1200-NEXT: v_fmac_f32_e32 v13, v14, v12 +; GFX1200-NEXT: v_fma_f32 v9, -v11, v13, v9 ; GFX1200-NEXT: s_denorm_mode 12 ; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-NEXT: v_div_fmas_f32 v4, v4, v7, 
v8 -; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 -; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 +; GFX1200-NEXT: v_div_fmas_f32 v9, v9, v12, v13 +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v10 +; GFX1200-NEXT: v_div_fixup_f32 v9, v9, v7, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB11_15 ; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader -; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8 +; GFX1200-NEXT: s_sub_co_i32 s0, s0, s1 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_add_co_i32 s7, s7, 12 +; GFX1200-NEXT: s_add_co_i32 s0, s0, 12 ; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body23 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_mov_b32_e32 v6, v3 +; GFX1200-NEXT: v_mov_b32_e32 v11, v8 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_add_co_i32 s7, s7, -12 +; GFX1200-NEXT: s_add_co_i32 s0, s0, -12 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_gt_i32 s7, 12 -; GFX1200-NEXT: v_mul_f32_e32 v3, v6, v4 +; GFX1200-NEXT: s_cmp_gt_i32 s0, 12 +; GFX1200-NEXT: v_mul_f32_e32 v8, v11, v9 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_rndne_f32_e32 v3, v3 -; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-NEXT: v_rndne_f32_e32 v8, v8 +; GFX1200-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v3, v3, v2, v6 -; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 -; GFX1200-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1200-NEXT: v_fma_f32 v8, v8, v7, v11 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v8 +; GFX1200-NEXT: v_add_f32_e32 v10, v8, v7 ; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo -; GFX1200-NEXT: v_ldexp_f32 v3, v3, 12 +; GFX1200-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX1200-NEXT: v_ldexp_f32 v8, v8, 12 ; GFX1200-NEXT: s_cbranch_scc1 .LBB11_13 ; GFX1200-NEXT: ; %bb.14: ; %Flow -; GFX1200-NEXT: v_mov_b32_e32 v5, s7 -; GFX1200-NEXT: v_mov_b32_e32 v3, v6 +; GFX1200-NEXT: v_mov_b32_e32 v10, s0 +; GFX1200-NEXT: v_mov_b32_e32 v8, v11 ; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit24 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5 -; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5 +; GFX1200-NEXT: v_add_nc_u32_e32 v10, -11, v10 +; GFX1200-NEXT: v_ldexp_f32 v8, v8, v10 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX1200-NEXT: v_rndne_f32_e32 v4, v4 +; GFX1200-NEXT: v_mul_f32_e32 v9, v8, v9 +; GFX1200-NEXT: v_rndne_f32_e32 v9, v9 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX1200-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1200-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 +; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v7 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 -; GFX1200-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v8 +; GFX1200-NEXT: v_add_f32_e32 v7, v8, v7 ; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc_lo ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s5 +; GFX1200-NEXT: v_ldexp_f32 v6, v7, v6 +; GFX1200-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, v3 ; GFX1200-NEXT: .LBB11_16: ; %Flow50 -; GFX1200-NEXT: s_cmp_lg_f32 s4, 0 -; GFX1200-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1200-NEXT: s_cmp_nge_f32 s3, 0x7f800000 -; GFX1200-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_and_b32 vcc_lo, s3, s4 -; GFX1200-NEXT: s_cmp_lg_f32 s2, 0 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo -; GFX1200-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1200-NEXT: s_cmp_nge_f32 s6, 0x7f800000 -; GFX1200-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1200-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v0 +; GFX1200-NEXT: v_cmp_nle_f32_e64 s0, 0x7f800000, v4 +; GFX1200-NEXT: v_mov_b32_e32 v3, 0 +; GFX1200-NEXT: s_and_b32 vcc_lo, s0, vcc_lo +; GFX1200-NEXT: v_cmp_nle_f32_e64 s0, 0x7f800000, v2 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_and_b32 vcc_lo, s3, s2 +; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v5, vcc_lo +; GFX1200-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0, v1 +; GFX1200-NEXT: s_and_b32 vcc_lo, s0, vcc_lo ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7fc00000, v1 -; GFX1200-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v6, vcc_lo +; GFX1200-NEXT: global_store_b64 v3, v[0:1], s[8:9] ; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 @@ -15155,23 +15144,23 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1150-NEXT: v_readfirstlane_b32 s10, v1 ; GFX1150-NEXT: v_readfirstlane_b32 s9, v2 ; GFX1150-NEXT: v_readfirstlane_b32 s7, v3 -; GFX1150-NEXT: global_load_b128 v[1:4], v4, s[4:5] offset:64 -; GFX1150-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1150-NEXT: s_and_b32 s5, s8, 0x7fffffff +; GFX1150-NEXT: global_load_b128 v[0:3], v4, s[4:5] offset:64 +; GFX1150-NEXT: s_and_b32 s6, s8, 0x7fffffff ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_readfirstlane_b32 s6, v1 -; GFX1150-NEXT: v_readfirstlane_b32 s4, v2 -; GFX1150-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1150-NEXT: v_readfirstlane_b32 s2, v4 -; GFX1150-NEXT: s_and_b32 s12, s6, 0x7fffffff +; GFX1150-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1150-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1150-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1150-NEXT: s_and_b32 s12, s5, 0x7fffffff ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1150-NEXT: s_cmp_ngt_f32 s5, s12 +; GFX1150-NEXT: s_cmp_ngt_f32 s6, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1150-NEXT: ; %bb.1: ; %frem.else -; GFX1150-NEXT: s_cmp_eq_f32 s5, s12 +; GFX1150-NEXT: s_cmp_eq_f32 s6, s12 ; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15181,13 +15170,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: .LBB12_2: ; GFX1150-NEXT: ; implicit-def: $vgpr0 ; GFX1150-NEXT: .LBB12_3: ; %frem.compute -; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s6| +; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s5| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s8| ; GFX1150-NEXT: 
v_frexp_exp_i32_f32_e32 v3, s8 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1150-NEXT: v_ldexp_f32 v1, v1, 1 ; GFX1150-NEXT: v_ldexp_f32 v2, v0, 12 -; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v0, s6 +; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v0, s5 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1150-NEXT: v_readfirstlane_b32 s11, v3 ; GFX1150-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 @@ -15541,13 +15530,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_ldexp_f32 v3, v4, v3 ; GFX1150-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s7 ; GFX1150-NEXT: .LBB12_32: ; %Flow116 -; GFX1150-NEXT: s_cmp_lg_f32 s6, 0 +; GFX1150-NEXT: s_cmp_lg_f32 s5, 0 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 -; GFX1150-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1150-NEXT: s_cmp_nge_f32 s5, 0x7f800000 ; GFX1150-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1150-NEXT: s_cmp_nge_f32 s6, 0x7f800000 +; GFX1150-NEXT: s_cselect_b32 s6, -1, 0 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1150-NEXT: s_and_b32 vcc_lo, s5, s6 +; GFX1150-NEXT: s_and_b32 vcc_lo, s6, s5 ; GFX1150-NEXT: s_cmp_lg_f32 s4, 0 ; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX1150-NEXT: s_cselect_b32 s4, -1, 0 @@ -15582,148 +15571,49 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_readfirstlane_b32 s10, v1 -; GFX1200-NEXT: v_readfirstlane_b32 s9, v2 -; GFX1200-NEXT: v_readfirstlane_b32 s7, v3 +; GFX1200-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1200-NEXT: v_readfirstlane_b32 s8, v2 +; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 ; GFX1200-NEXT: global_load_b128 v[1:4], v4, s[4:5] offset:64 -; GFX1200-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1200-NEXT: s_and_b32 s5, s8, 0x7fffffff +; GFX1200-NEXT: v_readfirstlane_b32 s7, v0 ; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_readfirstlane_b32 s6, v1 ; GFX1200-NEXT: v_readfirstlane_b32 s4, v2 ; GFX1200-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1200-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0 +; GFX1200-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1 +; GFX1200-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1200-NEXT: v_readfirstlane_b32 s2, v4 -; GFX1200-NEXT: s_and_b32 s12, s6, 0x7fffffff -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1200-NEXT: s_cmp_ngt_f32 s5, s12 -; GFX1200-NEXT: s_cbranch_scc0 .LBB12_2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1200-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v2, v3 +; GFX1200-NEXT: s_cbranch_vccz .LBB12_2 ; GFX1200-NEXT: ; %bb.1: ; %frem.else -; GFX1200-NEXT: s_cmp_eq_f32 s5, s12 -; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8 -; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo +; GFX1200-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, s7 +; GFX1200-NEXT: v_cmp_eq_f32_e32 vcc_lo, v2, v3 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-NEXT: v_cndmask_b32_e32 v3, s7, v4, vcc_lo ; GFX1200-NEXT: s_cbranch_execz .LBB12_3 ; GFX1200-NEXT: s_branch .LBB12_8 ; GFX1200-NEXT: .LBB12_2: -; GFX1200-NEXT: ; implicit-def: $vgpr0 +; GFX1200-NEXT: ; implicit-def: $vgpr3 ; GFX1200-NEXT: .LBB12_3: ; %frem.compute -; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s6| -; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s8| -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; 
GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |v0| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v5, |v1| ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_ldexp_f32 v1, v1, 1 -; GFX1200-NEXT: v_ldexp_f32 v2, v0, 12 -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v0, s6 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1200-NEXT: v_div_scale_f32 v5, null, v1, v1, 1.0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_readfirstlane_b32 s12, v0 -; GFX1200-NEXT: v_add_nc_u32_e32 v0, -1, v0 -; GFX1200-NEXT: v_rcp_f32_e32 v6, v5 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_not_b32_e32 v4, v0 -; GFX1200-NEXT: v_add_nc_u32_e32 v4, v4, v3 -; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 -; GFX1200-NEXT: s_denorm_mode 15 -; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v6 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX1200-NEXT: v_fma_f32 v8, -v5, v7, v3 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v6 -; GFX1200-NEXT: v_fma_f32 v3, -v5, v7, v3 -; GFX1200-NEXT: s_denorm_mode 12 -; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v6, v7 -; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 -; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 -; GFX1200-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader -; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body -; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1200-NEXT: v_mov_b32_e32 v5, v2 -; GFX1200-NEXT: s_add_co_i32 s11, s11, -12 -; GFX1200-NEXT: s_cmp_gt_i32 s11, 12 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v2, v5, v3 -; GFX1200-NEXT: v_rndne_f32_e32 v2, v2 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX1200-NEXT: v_fma_f32 v2, v2, v1, v5 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 -; GFX1200-NEXT: v_add_f32_e32 v4, v2, v1 -; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f32 v2, v2, 12 -; GFX1200-NEXT: s_cbranch_scc1 .LBB12_5 -; GFX1200-NEXT: ; %bb.6: ; %Flow125 -; GFX1200-NEXT: v_mov_b32_e32 v4, s11 -; GFX1200-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX1200-NEXT: v_readfirstlane_b32 s10, v4 +; GFX1200-NEXT: v_ldexp_f32 v3, v0, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v0, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_ldexp_f32 v1, v5, 1 +; GFX1200-NEXT: v_readfirstlane_b32 
s11, v0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4 -; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_mul_f32_e32 v3, v2, v3 -; GFX1200-NEXT: v_rndne_f32_e32 v3, v3 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX1200-NEXT: v_fmac_f32_e32 v2, v3, v1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 -; GFX1200-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f32 v0, v1, v0 -; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s8 -; GFX1200-NEXT: .LBB12_8: -; GFX1200-NEXT: s_and_b32 s8, s10, 0x7fffffff -; GFX1200-NEXT: s_and_b32 s12, s4, 0x7fffffff -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_ngt_f32 s8, s12 -; GFX1200-NEXT: s_cbranch_scc0 .LBB12_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else16 -; GFX1200-NEXT: s_cmp_eq_f32 s8, s12 -; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10 -; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_cndmask_b32_e32 v1, s10, v1, vcc_lo -; GFX1200-NEXT: s_cbranch_execz .LBB12_11 -; GFX1200-NEXT: s_branch .LBB12_16 -; GFX1200-NEXT: .LBB12_10: -; GFX1200-NEXT: ; implicit-def: $vgpr1 -; GFX1200-NEXT: .LBB12_11: ; %frem.compute15 -; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s4| -; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s10| -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_ldexp_f32 v2, v2, 1 -; GFX1200-NEXT: v_ldexp_f32 v3, v1, 12 -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v1, s4 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-NEXT: v_readfirstlane_b32 s11, v4 -; GFX1200-NEXT: v_div_scale_f32 v6, null, v2, v2, 1.0 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_readfirstlane_b32 s12, v1 -; GFX1200-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX1200-NEXT: v_div_scale_f32 v6, null, v1, v1, 1.0 ; GFX1200-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1200-NEXT: v_add_nc_u32_e32 v0, -1, v0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_not_b32_e32 v5, v1 +; GFX1200-NEXT: v_not_b32_e32 v5, v0 ; GFX1200-NEXT: v_add_nc_u32_e32 v5, v5, v4 -; GFX1200-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0 +; GFX1200-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v1, 1.0 ; GFX1200-NEXT: s_denorm_mode 15 ; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_fma_f32 v8, -v6, v7, 1.0 @@ -15739,37 +15629,36 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_div_fmas_f32 v4, v4, v7, v8 ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 -; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 -; GFX1200-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader -; GFX1200-NEXT: s_sub_co_i32 s11, s11, 
s12 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v1, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB12_7 +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: s_sub_co_i32 s10, s10, s11 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_add_co_i32 s10, s10, 12 +; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: v_mov_b32_e32 v6, v3 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_add_co_i32 s11, s11, -12 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_gt_i32 s11, 12 -; GFX1200-NEXT: v_mul_f32_e32 v3, v6, v4 +; GFX1200-NEXT: s_add_co_i32 s10, s10, -12 +; GFX1200-NEXT: s_cmp_gt_i32 s10, 12 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v3, v6, v4 ; GFX1200-NEXT: v_rndne_f32_e32 v3, v3 -; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v3, v3, v2, v6 +; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1200-NEXT: v_fma_f32 v3, v3, v1, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 -; GFX1200-NEXT: v_add_f32_e32 v5, v3, v2 +; GFX1200-NEXT: v_add_f32_e32 v5, v3, v1 ; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_ldexp_f32 v3, v3, 12 -; GFX1200-NEXT: s_cbranch_scc1 .LBB12_13 -; GFX1200-NEXT: ; %bb.14: ; %Flow121 -; GFX1200-NEXT: v_mov_b32_e32 v5, s11 +; GFX1200-NEXT: s_cbranch_scc1 .LBB12_5 +; GFX1200-NEXT: ; %bb.6: ; %Flow125 +; GFX1200-NEXT: v_mov_b32_e32 v5, s10 ; GFX1200-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5 @@ -15778,51 +15667,51 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX1200-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX1200-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 -; GFX1200-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX1200-NEXT: v_add_f32_e32 v1, v3, v1 ; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s10 -; GFX1200-NEXT: .LBB12_16: -; GFX1200-NEXT: s_and_b32 s10, s9, 0x7fffffff -; GFX1200-NEXT: s_and_b32 s12, s3, 0x7fffffff +; GFX1200-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, v0, s7 +; 
GFX1200-NEXT: .LBB12_8: +; GFX1200-NEXT: s_and_b32 s7, s9, 0x7fffffff +; GFX1200-NEXT: s_and_b32 s11, s4, 0x7fffffff ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_ngt_f32 s10, s12 -; GFX1200-NEXT: s_cbranch_scc0 .LBB12_18 -; GFX1200-NEXT: ; %bb.17: ; %frem.else47 -; GFX1200-NEXT: s_cmp_eq_f32 s10, s12 -; GFX1200-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9 +; GFX1200-NEXT: s_cmp_ngt_f32 s7, s11 +; GFX1200-NEXT: s_cbranch_scc0 .LBB12_10 +; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: s_cmp_eq_f32 s7, s11 +; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s9 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_cndmask_b32_e32 v2, s9, v2, vcc_lo -; GFX1200-NEXT: s_cbranch_execz .LBB12_19 -; GFX1200-NEXT: s_branch .LBB12_24 -; GFX1200-NEXT: .LBB12_18: -; GFX1200-NEXT: ; implicit-def: $vgpr2 -; GFX1200-NEXT: .LBB12_19: ; %frem.compute46 -; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s3| -; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s9| +; GFX1200-NEXT: v_cndmask_b32_e32 v0, s9, v0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB12_11 +; GFX1200-NEXT: s_branch .LBB12_16 +; GFX1200-NEXT: .LBB12_10: +; GFX1200-NEXT: ; implicit-def: $vgpr0 +; GFX1200-NEXT: .LBB12_11: ; %frem.compute15 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s4| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s9| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v5, s9 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_ldexp_f32 v3, v3, 1 -; GFX1200-NEXT: v_ldexp_f32 v4, v2, 12 -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v2, s3 +; GFX1200-NEXT: v_ldexp_f32 v1, v1, 1 +; GFX1200-NEXT: v_ldexp_f32 v4, v0, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v0, s4 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-NEXT: v_readfirstlane_b32 s11, v5 -; GFX1200-NEXT: v_div_scale_f32 v7, null, v3, v3, 1.0 +; GFX1200-NEXT: v_readfirstlane_b32 s10, v5 +; GFX1200-NEXT: v_div_scale_f32 v7, null, v1, v1, 1.0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_readfirstlane_b32 s12, v2 -; GFX1200-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1200-NEXT: v_readfirstlane_b32 s11, v0 +; GFX1200-NEXT: v_add_nc_u32_e32 v0, -1, v0 ; GFX1200-NEXT: v_rcp_f32_e32 v8, v7 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_not_b32_e32 v6, v2 +; GFX1200-NEXT: v_not_b32_e32 v6, v0 ; GFX1200-NEXT: v_add_nc_u32_e32 v6, v6, v5 -; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0 +; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0 ; GFX1200-NEXT: s_denorm_mode 15 ; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_fma_f32 v9, -v7, v8, 1.0 @@ -15838,37 +15727,37 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v8, v9 ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6 -; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 -; GFX1200-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body54.preheader -; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 +; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v1, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB12_15 +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: s_sub_co_i32 s10, s10, s11 ; GFX1200-NEXT: s_wait_alu 
0xfffe -; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX1200-NEXT: s_add_co_i32 s10, s10, 12 +; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body23 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v7, v4 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_add_co_i32 s11, s11, -12 +; GFX1200-NEXT: s_add_co_i32 s10, s10, -12 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_gt_i32 s11, 12 +; GFX1200-NEXT: s_cmp_gt_i32 s10, 12 ; GFX1200-NEXT: v_mul_f32_e32 v4, v7, v5 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_rndne_f32_e32 v4, v4 ; GFX1200-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_fma_f32 v4, v4, v3, v7 +; GFX1200-NEXT: v_fma_f32 v4, v4, v1, v7 ; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 -; GFX1200-NEXT: v_add_f32_e32 v6, v4, v3 +; GFX1200-NEXT: v_add_f32_e32 v6, v4, v1 ; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX1200-NEXT: v_ldexp_f32 v4, v4, 12 -; GFX1200-NEXT: s_cbranch_scc1 .LBB12_21 -; GFX1200-NEXT: ; %bb.22: ; %Flow117 -; GFX1200-NEXT: v_mov_b32_e32 v6, s11 +; GFX1200-NEXT: s_cbranch_scc1 .LBB12_13 +; GFX1200-NEXT: ; %bb.14: ; %Flow121 +; GFX1200-NEXT: v_mov_b32_e32 v6, s10 ; GFX1200-NEXT: v_mov_b32_e32 v4, v7 -; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit24 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v6, -11, v6 ; GFX1200-NEXT: v_ldexp_f32 v4, v4, v6 @@ -15877,49 +15766,49 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_rndne_f32_e32 v5, v5 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX1200-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX1200-NEXT: v_fmac_f32_e32 v4, v5, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 -; GFX1200-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-NEXT: v_add_f32_e32 v1, v4, v1 ; GFX1200-NEXT: s_wait_alu 0xfffd -; GFX1200-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f32 v2, v3, v2 -; GFX1200-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s9 -; GFX1200-NEXT: .LBB12_24: -; GFX1200-NEXT: s_and_b32 s9, s7, 0x7fffffff -; GFX1200-NEXT: s_and_b32 s12, s2, 0x7fffffff +; GFX1200-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s9 +; GFX1200-NEXT: .LBB12_16: +; GFX1200-NEXT: s_and_b32 s9, s8, 0x7fffffff +; GFX1200-NEXT: s_and_b32 s11, s3, 0x7fffffff ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_ngt_f32 s9, s12 -; GFX1200-NEXT: s_cbranch_scc0 .LBB12_26 -; GFX1200-NEXT: ; %bb.25: ; %frem.else78 -; GFX1200-NEXT: s_cmp_eq_f32 s9, s12 -; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7 +; GFX1200-NEXT: s_cmp_ngt_f32 s9, s11 +; GFX1200-NEXT: s_cbranch_scc0 .LBB12_18 +; GFX1200-NEXT: ; %bb.17: ; %frem.else47 +; GFX1200-NEXT: s_cmp_eq_f32 s9, s11 +; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s8 ; GFX1200-NEXT: s_cselect_b32 
vcc_lo, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_cndmask_b32_e32 v3, s7, v3, vcc_lo -; GFX1200-NEXT: s_cbranch_execz .LBB12_27 -; GFX1200-NEXT: s_branch .LBB12_32 -; GFX1200-NEXT: .LBB12_26: -; GFX1200-NEXT: ; implicit-def: $vgpr3 -; GFX1200-NEXT: .LBB12_27: ; %frem.compute77 -; GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |s2| -; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s7| -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, s7 +; GFX1200-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB12_19 +; GFX1200-NEXT: s_branch .LBB12_24 +; GFX1200-NEXT: .LBB12_18: +; GFX1200-NEXT: ; implicit-def: $vgpr1 +; GFX1200-NEXT: .LBB12_19: ; %frem.compute46 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |s3| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s8| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, s8 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_ldexp_f32 v4, v4, 1 -; GFX1200-NEXT: v_ldexp_f32 v5, v3, 12 -; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX1200-NEXT: v_ldexp_f32 v5, v1, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-NEXT: v_readfirstlane_b32 s11, v6 +; GFX1200-NEXT: v_readfirstlane_b32 s10, v6 ; GFX1200-NEXT: v_div_scale_f32 v8, null, v4, v4, 1.0 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_readfirstlane_b32 s12, v3 -; GFX1200-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX1200-NEXT: v_readfirstlane_b32 s11, v1 +; GFX1200-NEXT: v_add_nc_u32_e32 v1, -1, v1 ; GFX1200-NEXT: v_rcp_f32_e32 v9, v8 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_not_b32_e32 v7, v3 +; GFX1200-NEXT: v_not_b32_e32 v7, v1 ; GFX1200-NEXT: v_add_nc_u32_e32 v7, v7, v6 ; GFX1200-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0 ; GFX1200-NEXT: s_denorm_mode 15 @@ -15938,19 +15827,19 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_div_fmas_f32 v6, v6, v9, v10 ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 ; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 -; GFX1200-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body85.preheader -; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 +; GFX1200-NEXT: s_cbranch_vccnz .LBB12_23 +; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX1200-NEXT: s_sub_co_i32 s10, s10, s11 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX1200-NEXT: s_add_co_i32 s10, s10, 12 +; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body54 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v8, v5 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_add_co_i32 s11, s11, -12 +; GFX1200-NEXT: s_add_co_i32 s10, s10, -12 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_gt_i32 s11, 12 +; GFX1200-NEXT: s_cmp_gt_i32 s10, 12 ; GFX1200-NEXT: v_mul_f32_e32 v5, v8, v6 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_rndne_f32_e32 v5, v5 @@ -15963,11 +15852,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; 
GFX1200-NEXT: v_ldexp_f32 v5, v5, 12 -; GFX1200-NEXT: s_cbranch_scc1 .LBB12_29 -; GFX1200-NEXT: ; %bb.30: ; %Flow -; GFX1200-NEXT: v_mov_b32_e32 v7, s11 +; GFX1200-NEXT: s_cbranch_scc1 .LBB12_21 +; GFX1200-NEXT: ; %bb.22: ; %Flow117 +; GFX1200-NEXT: v_mov_b32_e32 v7, s10 ; GFX1200-NEXT: v_mov_b32_e32 v5, v8 -; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit55 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v7, -11, v7 ; GFX1200-NEXT: v_ldexp_f32 v5, v5, v7 @@ -15983,43 +15872,141 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffd ; GFX1200-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f32 v3, v4, v3 -; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s7 +; GFX1200-NEXT: v_ldexp_f32 v1, v4, v1 +; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s8 +; GFX1200-NEXT: .LBB12_24: +; GFX1200-NEXT: s_and_b32 s8, s6, 0x7fffffff +; GFX1200-NEXT: s_and_b32 s11, s2, 0x7fffffff +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_ngt_f32 s8, s11 +; GFX1200-NEXT: s_cbranch_scc0 .LBB12_26 +; GFX1200-NEXT: ; %bb.25: ; %frem.else78 +; GFX1200-NEXT: s_cmp_eq_f32 s8, s11 +; GFX1200-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, s6 +; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v4, s6, v4, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB12_27 +; GFX1200-NEXT: s_branch .LBB12_32 +; GFX1200-NEXT: .LBB12_26: +; GFX1200-NEXT: ; implicit-def: $vgpr4 +; GFX1200-NEXT: .LBB12_27: ; %frem.compute77 +; GFX1200-NEXT: v_frexp_mant_f32_e64 v5, |s2| +; GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |s6| +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v7, s6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_ldexp_f32 v5, v5, 1 +; GFX1200-NEXT: v_ldexp_f32 v6, v4, 12 +; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s2 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-NEXT: v_readfirstlane_b32 s10, v7 +; GFX1200-NEXT: v_div_scale_f32 v9, null, v5, v5, 1.0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_readfirstlane_b32 s11, v4 +; GFX1200-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; GFX1200-NEXT: v_rcp_f32_e32 v10, v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_not_b32_e32 v8, v4 +; GFX1200-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX1200-NEXT: v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0 +; GFX1200-NEXT: s_denorm_mode 15 +; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX1200-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX1200-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX1200-NEXT: s_denorm_mode 12 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_div_fmas_f32 v7, v7, v10, v11 
+; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v8 +; GFX1200-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 +; GFX1200-NEXT: s_cbranch_vccnz .LBB12_31 +; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX1200-NEXT: s_sub_co_i32 s10, s10, s11 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s10, s10, 12 +; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v9, v6 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_i32 s10, s10, -12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s10, 12 +; GFX1200-NEXT: v_mul_f32_e32 v6, v9, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f32_e32 v6, v6 +; GFX1200-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_fma_f32 v6, v6, v5, v9 +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX1200-NEXT: v_add_f32_e32 v8, v6, v5 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX1200-NEXT: v_ldexp_f32 v6, v6, 12 +; GFX1200-NEXT: s_cbranch_scc1 .LBB12_29 +; GFX1200-NEXT: ; %bb.30: ; %Flow +; GFX1200-NEXT: v_mov_b32_e32 v8, s10 +; GFX1200-NEXT: v_mov_b32_e32 v6, v9 +; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_nc_u32_e32 v8, -11, v8 +; GFX1200-NEXT: v_ldexp_f32 v6, v6, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX1200-NEXT: v_rndne_f32_e32 v7, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 +; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v6 +; GFX1200-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX1200-NEXT: v_bfi_b32 v4, 0x7fffffff, v4, s6 ; GFX1200-NEXT: .LBB12_32: ; %Flow116 -; GFX1200-NEXT: s_cmp_lg_f32 s6, 0 -; GFX1200-NEXT: v_mov_b32_e32 v4, 0 -; GFX1200-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1200-NEXT: s_cmp_nge_f32 s5, 0x7f800000 +; GFX1200-NEXT: s_cmp_lg_f32 s5, 0 +; GFX1200-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0x7f800000, v2 ; GFX1200-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_and_b32 vcc_lo, s5, s6 +; GFX1200-NEXT: s_and_b32 vcc_lo, vcc_lo, s5 ; GFX1200-NEXT: s_cmp_lg_f32 s4, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v3, vcc_lo ; GFX1200-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1200-NEXT: s_cmp_nge_f32 s8, 0x7f800000 +; GFX1200-NEXT: s_cmp_nge_f32 s7, 0x7f800000 ; GFX1200-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 vcc_lo, s5, s4 ; GFX1200-NEXT: s_cmp_lg_f32 s3, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v6, 
0x7fc00000, v0, vcc_lo +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 ; GFX1200-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1200-NEXT: s_cmp_nge_f32 s10, 0x7f800000 +; GFX1200-NEXT: s_cmp_nge_f32 s9, 0x7f800000 ; GFX1200-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 vcc_lo, s4, s3 ; GFX1200-NEXT: s_cmp_lg_f32 s2, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v2, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v1, vcc_lo ; GFX1200-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1200-NEXT: s_cmp_nge_f32 s9, 0x7f800000 +; GFX1200-NEXT: s_cmp_nge_f32 s8, 0x7f800000 ; GFX1200-NEXT: s_cselect_b32 s3, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_and_b32 vcc_lo, s3, s2 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v3, vcc_lo -; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1200-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v4, vcc_lo +; GFX1200-NEXT: global_store_b128 v0, v[5:8], s[0:1] ; GFX1200-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll b/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll new file mode 100644 index 0000000000000..658fdf3d014d7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -march=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s + +define amdgpu_kernel void @runningSum(ptr addrspace(1) %out, i32 %inputElement0, i32 %inputElement1, i32 %inputIter) { +; GCN-LABEL: runningSum: +; GCN: ; %bb.0: ; %bb.0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x30 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: .LBB0_1: ; %loopBody +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_add_i32 s1, s2, s1 +; GCN-NEXT: s_add_i32 s0, s2, s0 +; GCN-NEXT: s_add_i32 s3, s3, -1 +; GCN-NEXT: s_cmp_lg_u32 s3, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB0_1 +; GCN-NEXT: ; %bb.2: ; %loopExit +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dwordx2 v0, v[2:3], s[2:3] +; GCN-NEXT: s_endpgm +bb.0: + br label %preheader + +preheader: + %vecElement0 = insertelement <2 x i32> poison, i32 %inputElement0, i64 0 + %broadcast0 = shufflevector <2 x i32> %vecElement0, <2 x i32> poison, <2 x i32> zeroinitializer + %vecElement1 = insertelement <2 x i32> poison, i32 %inputElement1, i64 0 + %broadcast1 = shufflevector <2 x i32> %vecElement1, <2 x i32> poison, <2 x i32> zeroinitializer + br label %loopBody + +loopBody: + %previousSum = phi <2 x i32> [ %broadcast1, %preheader ], [ %runningSum, %loopBody ] + %iterCount = phi i32 [ %inputIter, %preheader ], [ %itersLeft, %loopBody ] + %runningSum = add <2 x i32> %broadcast1, %previousSum + %itersLeft = sub i32 %iterCount, 1 + %cond = icmp eq i32 %itersLeft, 0 + br i1 %cond, label %loopExit, label %loopBody, !llvm.loop !0 + +loopExit: + store <2 x i32> %runningSum, ptr addrspace(1) %out + ret void +} + +!0 = !{!"llvm.loop.mustprogress"} + diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 5944342b2642a..2ba9a26fcc0c0 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -3356,181 +3356,152 @@ define amdgpu_kernel 
void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-LABEL: srem_v2i64: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 16 +; TONGA-NEXT: v_mov_b32_e32 v4, s6 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v0, s0 -; TONGA-NEXT: v_mov_b32_e32 v4, s6 -; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: v_mov_b32_e32 v5, s7 +; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_readfirstlane_b32 s1, v1 -; TONGA-NEXT: v_readfirstlane_b32 s0, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_readfirstlane_b32 s3, v5 -; TONGA-NEXT: v_readfirstlane_b32 s2, v4 -; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] -; TONGA-NEXT: s_mov_b32 s6, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 -; TONGA-NEXT: s_cbranch_scc0 .LBB10_3 +; TONGA-NEXT: v_or_b32_e32 v9, v5, v1 +; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; TONGA-NEXT: s_cbranch_vccz .LBB10_7 ; TONGA-NEXT: ; %bb.1: -; TONGA-NEXT: s_ashr_i32 s6, s1, 31 -; TONGA-NEXT: s_add_u32 s8, s0, s6 -; TONGA-NEXT: s_mov_b32 s7, s6 -; TONGA-NEXT: s_addc_u32 s9, s1, s6 -; TONGA-NEXT: s_xor_b64 s[6:7], s[8:9], s[6:7] -; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s6 -; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s7 -; TONGA-NEXT: s_sub_u32 s1, 0, s6 -; TONGA-NEXT: s_subb_u32 s10, 0, s7 -; TONGA-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 -; TONGA-NEXT: v_rcp_f32_e32 v0, v0 -; TONGA-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; TONGA-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; TONGA-NEXT: v_trunc_f32_e32 v1, v1 -; TONGA-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v1 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 -; TONGA-NEXT: v_mul_lo_u32 v4, s1, v8 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s1, v9, 0 -; TONGA-NEXT: v_mul_lo_u32 v5, s10, v9 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v4 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v1, v5 -; TONGA-NEXT: v_mul_hi_u32 v10, v9, v0 -; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], v9, v11, 0 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v8, v0, 0 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v4 -; TONGA-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc -; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], v8, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v10, v0 -; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v12, v1, vcc -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v9, v0 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s1, v10, 0 -; TONGA-NEXT: v_mul_lo_u32 v8, s1, v11 -; TONGA-NEXT: v_mul_lo_u32 v9, s10, v10 -; TONGA-NEXT: v_mul_hi_u32 v12, v10, v0 -; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], v11, v0, 0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v8, v1 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v9, v1 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], v10, v1, 0 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v1, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v12, v8 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v0, v8 +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; TONGA-NEXT: v_xor_b32_e32 v14, v9, v8 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v8 +; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v14 +; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v1 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v14 +; TONGA-NEXT: 
v_subb_u32_e32 v16, vcc, 0, v1, vcc +; TONGA-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 +; TONGA-NEXT: v_rcp_f32_e32 v8, v8 +; TONGA-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; TONGA-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; TONGA-NEXT: v_trunc_f32_e32 v9, v9 +; TONGA-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 +; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v9 +; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v8 +; TONGA-NEXT: v_mul_lo_u32 v10, v15, v12 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v13, 0 +; TONGA-NEXT: v_mul_lo_u32 v11, v16, v13 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v10 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v9, v11 +; TONGA-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v13, v11, 0 +; TONGA-NEXT: v_mul_hi_u32 v17, v13, v8 +; TONGA-NEXT: v_add_u32_e32 v17, vcc, v17, v9 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v12, v8, 0 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v10, vcc +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v11, 0 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v17, v8 +; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v18, v9, vcc +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v10 ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v8, v4 -; TONGA-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; TONGA-NEXT: s_ashr_i32 s10, s3, 31 -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; TONGA-NEXT: s_add_u32 s8, s2, s10 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v10, v0 -; TONGA-NEXT: s_mov_b32 s11, s10 -; TONGA-NEXT: s_addc_u32 s9, s3, s10 -; TONGA-NEXT: v_addc_u32_e32 v5, vcc, v11, v1, vcc -; TONGA-NEXT: s_xor_b64 s[12:13], s[8:9], s[10:11] -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s12, v5, 0 -; TONGA-NEXT: v_mul_hi_u32 v8, s12, v4 -; TONGA-NEXT: v_readfirstlane_b32 s1, v1 -; TONGA-NEXT: v_readfirstlane_b32 s3, v0 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s13, v5, 0 -; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], s13, v4, 0 -; TONGA-NEXT: v_readfirstlane_b32 s14, v8 -; TONGA-NEXT: s_add_u32 s3, s14, s3 -; TONGA-NEXT: s_addc_u32 s1, 0, s1 -; TONGA-NEXT: v_readfirstlane_b32 s14, v4 -; TONGA-NEXT: v_readfirstlane_b32 s9, v5 -; TONGA-NEXT: s_add_u32 s3, s3, s14 -; TONGA-NEXT: v_readfirstlane_b32 s8, v1 -; TONGA-NEXT: s_addc_u32 s1, s1, s9 -; TONGA-NEXT: s_addc_u32 s3, s8, 0 -; TONGA-NEXT: v_readfirstlane_b32 s8, v0 -; TONGA-NEXT: s_add_u32 s1, s1, s8 -; TONGA-NEXT: v_mov_b32_e32 v0, s1 -; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s6, v0, 0 -; TONGA-NEXT: s_addc_u32 s3, 0, s3 -; TONGA-NEXT: s_mul_i32 s3, s6, s3 -; TONGA-NEXT: v_readfirstlane_b32 s14, v1 -; TONGA-NEXT: s_add_i32 s3, s14, s3 -; TONGA-NEXT: s_mul_i32 s1, s7, s1 -; TONGA-NEXT: s_add_i32 s3, s3, s1 -; TONGA-NEXT: s_sub_i32 s1, s13, s3 -; TONGA-NEXT: v_readfirstlane_b32 s14, v0 -; TONGA-NEXT: s_sub_u32 s12, s12, s14 -; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 -; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s18, s12, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 -; TONGA-NEXT: s_subb_u32 s19, s1, 0 -; TONGA-NEXT: s_cmp_ge_u32 s19, s7 -; TONGA-NEXT: s_cselect_b32 s20, -1, 0 -; TONGA-NEXT: s_cmp_ge_u32 s18, s6 -; TONGA-NEXT: s_cselect_b32 s21, -1, 0 -; TONGA-NEXT: s_cmp_eq_u32 s19, s7 -; TONGA-NEXT: s_cselect_b32 s20, s21, s20 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 -; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; 
TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 -; TONGA-NEXT: s_subb_u32 s1, s1, 0 -; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 -; TONGA-NEXT: s_cselect_b32 s1, s1, s19 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 -; TONGA-NEXT: s_subb_u32 s3, s13, s3 -; TONGA-NEXT: s_cmp_ge_u32 s3, s7 -; TONGA-NEXT: s_cselect_b32 s13, -1, 0 -; TONGA-NEXT: s_cmp_ge_u32 s12, s6 -; TONGA-NEXT: s_cselect_b32 s6, -1, 0 -; TONGA-NEXT: s_cmp_eq_u32 s3, s7 -; TONGA-NEXT: s_cselect_b32 s6, s6, s13 -; TONGA-NEXT: s_cmp_lg_u32 s6, 0 -; TONGA-NEXT: s_cselect_b32 s7, s1, s3 -; TONGA-NEXT: s_cselect_b32 s6, s16, s12 -; TONGA-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] -; TONGA-NEXT: s_sub_u32 s6, s6, s10 -; TONGA-NEXT: s_subb_u32 s7, s7, s10 -; TONGA-NEXT: s_cbranch_execnz .LBB10_4 +; TONGA-NEXT: v_add_u32_e32 v17, vcc, v13, v8 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v12, v9, vcc +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v17, 0 +; TONGA-NEXT: v_mul_lo_u32 v12, v15, v18 +; TONGA-NEXT: v_mul_lo_u32 v13, v16, v17 +; TONGA-NEXT: v_mul_hi_u32 v15, v17, v8 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v18, v8, 0 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v12, v9 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v13, v9 +; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v17, v9, 0 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v18, v9, 0 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v15, v12 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v12, v10 +; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v13, v11, vcc +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v17, v8 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v18, v9, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v4, v12 +; TONGA-NEXT: v_xor_b32_e32 v13, v8, v12 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v13, v11, 0 +; TONGA-NEXT: v_mul_hi_u32 v15, v13, v10 +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, v5, v12, vcc +; TONGA-NEXT: v_xor_b32_e32 v5, v5, v12 +; TONGA-NEXT: v_add_u32_e32 v15, vcc, v15, v8 +; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v9, vcc +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v10, 0 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v11, 0 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v15, v8 +; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v8, v10 +; TONGA-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; TONGA-NEXT: v_mul_lo_u32 v11, v14, v8 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v10, 0 +; TONGA-NEXT: v_mul_lo_u32 v10, v1, v10 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v11, v9 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v9 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v5, v9 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v13, v8 +; TONGA-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, vcc +; TONGA-NEXT: v_sub_u32_e64 v11, s[0:1], v8, v14 +; TONGA-NEXT: v_subbrev_u32_e64 v13, s[2:3], 0, v10, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v13, v1 +; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v11, v14 +; TONGA-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v13, v1 +; TONGA-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] +; TONGA-NEXT: v_sub_u32_e64 v16, s[0:1], v11, v14 +; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc +; TONGA-NEXT: v_subbrev_u32_e64 v10, 
s[0:1], 0, v10, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 +; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v14 +; TONGA-NEXT: v_cndmask_b32_e64 v10, v13, v10, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v11, v11, v16, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v10, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc +; TONGA-NEXT: v_xor_b32_e32 v5, v5, v12 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v12 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v5, v12 +; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v1, v12, vcc +; TONGA-NEXT: s_cbranch_execnz .LBB10_3 ; TONGA-NEXT: .LBB10_2: -; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s0 -; TONGA-NEXT: s_sub_i32 s1, 0, s0 +; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v0 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 ; TONGA-NEXT: v_mov_b32_e32 v9, 0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 -; TONGA-NEXT: v_mul_lo_u32 v1, s1, v0 -; TONGA-NEXT: v_mul_hi_u32 v1, v0, v1 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; TONGA-NEXT: v_mul_hi_u32 v0, s2, v0 -; TONGA-NEXT: v_mul_lo_u32 v0, v0, s0 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s0, v0 -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s0, v0 -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 -; TONGA-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc -; TONGA-NEXT: s_branch .LBB10_5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 +; TONGA-NEXT: v_mul_lo_u32 v5, v5, v1 +; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; TONGA-NEXT: v_mul_hi_u32 v1, v4, v1 +; TONGA-NEXT: v_mul_lo_u32 v1, v1, v0 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 +; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v0, v1 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v1, v0 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; TONGA-NEXT: v_cndmask_b32_e32 v8, v1, v4, vcc ; TONGA-NEXT: .LBB10_3: -; TONGA-NEXT: ; implicit-def: $sgpr6_sgpr7 -; TONGA-NEXT: s_branch .LBB10_2 -; TONGA-NEXT: .LBB10_4: -; TONGA-NEXT: v_mov_b32_e32 v9, s7 -; TONGA-NEXT: v_mov_b32_e32 v8, s6 -; TONGA-NEXT: .LBB10_5: ; TONGA-NEXT: v_or_b32_e32 v1, v7, v3 ; TONGA-NEXT: v_mov_b32_e32 v0, 0 ; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; TONGA-NEXT: s_cbranch_vccz .LBB10_9 -; TONGA-NEXT: ; %bb.6: +; TONGA-NEXT: s_cbranch_vccz .LBB10_8 +; TONGA-NEXT: ; %bb.4: ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v0 ; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc @@ -3636,8 +3607,8 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 ; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v0, v11 ; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v1, v11, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB10_8 -; TONGA-NEXT: .LBB10_7: +; TONGA-NEXT: s_cbranch_execnz .LBB10_6 +; TONGA-NEXT: .LBB10_5: ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v2 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, 0, v2 ; TONGA-NEXT: v_mov_b32_e32 v11, 0 @@ -3656,13 +3627,16 @@ define amdgpu_kernel void @srem_v2i64(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc -; TONGA-NEXT: .LBB10_8: +; TONGA-NEXT: .LBB10_6: ; TONGA-NEXT: v_mov_b32_e32 v0, s4 ; TONGA-NEXT: v_mov_b32_e32 v1, s5 ; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; TONGA-NEXT: s_endpgm -; TONGA-NEXT: .LBB10_9: -; TONGA-NEXT: s_branch .LBB10_7 +; TONGA-NEXT: .LBB10_7: +; TONGA-NEXT: ; implicit-def: $vgpr8_vgpr9 +; TONGA-NEXT: s_branch .LBB10_2 +; TONGA-NEXT: .LBB10_8: +; TONGA-NEXT: s_branch .LBB10_5 ; ; EG-LABEL: srem_v2i64: ; EG: ; %bb.0: @@ -6182,6 +6156,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-LABEL: srem_v4i64: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 ; TONGA-NEXT: v_mov_b32_e32 v0, s6 @@ -6201,279 +6176,249 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mov_b32_e32 v4, s0 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; TONGA-NEXT: s_waitcnt vmcnt(3) -; TONGA-NEXT: v_readfirstlane_b32 s3, v15 -; TONGA-NEXT: v_readfirstlane_b32 s2, v14 ; TONGA-NEXT: s_waitcnt vmcnt(2) -; TONGA-NEXT: v_readfirstlane_b32 s1, v11 -; TONGA-NEXT: v_readfirstlane_b32 s0, v10 -; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] -; TONGA-NEXT: s_mov_b32 s6, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 -; TONGA-NEXT: s_cbranch_scc0 .LBB12_3 +; TONGA-NEXT: v_or_b32_e32 v9, v15, v11 +; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; TONGA-NEXT: s_cbranch_vccz .LBB12_13 ; TONGA-NEXT: ; %bb.1: -; TONGA-NEXT: s_ashr_i32 s6, s1, 31 -; TONGA-NEXT: s_add_u32 s8, s0, s6 -; TONGA-NEXT: s_mov_b32 s7, s6 -; TONGA-NEXT: s_addc_u32 s9, s1, s6 -; TONGA-NEXT: s_xor_b64 s[6:7], s[8:9], s[6:7] -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, s6 -; TONGA-NEXT: v_cvt_f32_u32_e32 v9, s7 -; TONGA-NEXT: s_sub_u32 s1, 0, s6 -; TONGA-NEXT: s_subb_u32 s10, 0, s7 -; TONGA-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 -; TONGA-NEXT: v_rcp_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; TONGA-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; TONGA-NEXT: v_trunc_f32_e32 v9, v9 -; TONGA-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 -; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v15, v8 -; TONGA-NEXT: v_mul_lo_u32 v10, s1, v14 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s1, v15, 0 -; TONGA-NEXT: v_mul_lo_u32 v11, s10, v15 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v10 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v9, v11 -; TONGA-NEXT: v_mul_hi_u32 v18, v15, v8 -; TONGA-NEXT: v_mad_u64_u32 v[9:10], s[8:9], v15, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v9 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], v14, v8, 0 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v10, vcc -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v18, v8 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v19, v9, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v10 -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v15, v8 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, v14, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s1, v18, 0 -; TONGA-NEXT: v_mul_lo_u32 v14, s1, v19 -; TONGA-NEXT: v_mul_lo_u32 v15, s10, v18 -; TONGA-NEXT: v_mul_hi_u32 v20, v18, v8 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], 
s[8:9], v19, v8, 0 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v14, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v15, v9 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[8:9], v18, v9, 0 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], v19, v9, 0 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v20, v14 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v14, v10 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v15, v11, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 -; TONGA-NEXT: s_ashr_i32 s10, s3, 31 -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: s_add_u32 s8, s2, s10 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v18, v8 -; TONGA-NEXT: s_mov_b32 s11, s10 -; TONGA-NEXT: s_addc_u32 s9, s3, s10 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v19, v9, vcc -; TONGA-NEXT: s_xor_b64 s[12:13], s[8:9], s[10:11] -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s12, v11, 0 -; TONGA-NEXT: v_mul_hi_u32 v14, s12, v10 -; TONGA-NEXT: v_readfirstlane_b32 s1, v9 -; TONGA-NEXT: v_readfirstlane_b32 s3, v8 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s13, v11, 0 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[8:9], s13, v10, 0 -; TONGA-NEXT: v_readfirstlane_b32 s14, v14 -; TONGA-NEXT: s_add_u32 s3, s14, s3 -; TONGA-NEXT: s_addc_u32 s1, 0, s1 -; TONGA-NEXT: v_readfirstlane_b32 s14, v10 -; TONGA-NEXT: v_readfirstlane_b32 s9, v11 -; TONGA-NEXT: s_add_u32 s3, s3, s14 -; TONGA-NEXT: v_readfirstlane_b32 s8, v9 -; TONGA-NEXT: s_addc_u32 s1, s1, s9 -; TONGA-NEXT: s_addc_u32 s3, s8, 0 -; TONGA-NEXT: v_readfirstlane_b32 s8, v8 -; TONGA-NEXT: s_add_u32 s1, s1, s8 -; TONGA-NEXT: v_mov_b32_e32 v8, s1 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s6, v8, 0 -; TONGA-NEXT: s_addc_u32 s3, 0, s3 -; TONGA-NEXT: s_mul_i32 s3, s6, s3 -; TONGA-NEXT: v_readfirstlane_b32 s14, v9 -; TONGA-NEXT: s_add_i32 s3, s14, s3 -; TONGA-NEXT: s_mul_i32 s1, s7, s1 -; TONGA-NEXT: s_add_i32 s3, s3, s1 -; TONGA-NEXT: s_sub_i32 s1, s13, s3 -; TONGA-NEXT: v_readfirstlane_b32 s14, v8 -; TONGA-NEXT: s_sub_u32 s12, s12, s14 -; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 -; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s18, s12, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 -; TONGA-NEXT: s_subb_u32 s19, s1, 0 -; TONGA-NEXT: s_cmp_ge_u32 s19, s7 -; TONGA-NEXT: s_cselect_b32 s20, -1, 0 -; TONGA-NEXT: s_cmp_ge_u32 s18, s6 -; TONGA-NEXT: s_cselect_b32 s21, -1, 0 -; TONGA-NEXT: s_cmp_eq_u32 s19, s7 -; TONGA-NEXT: s_cselect_b32 s20, s21, s20 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 -; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s21, s18, s6 -; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 -; TONGA-NEXT: s_subb_u32 s1, s1, 0 -; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s21, s18 -; TONGA-NEXT: s_cselect_b32 s1, s1, s19 -; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 -; TONGA-NEXT: s_subb_u32 s3, s13, s3 -; TONGA-NEXT: s_cmp_ge_u32 s3, s7 -; TONGA-NEXT: s_cselect_b32 s13, -1, 0 -; TONGA-NEXT: s_cmp_ge_u32 s12, s6 -; TONGA-NEXT: s_cselect_b32 s6, -1, 0 -; TONGA-NEXT: s_cmp_eq_u32 s3, s7 -; TONGA-NEXT: s_cselect_b32 s6, s6, s13 -; TONGA-NEXT: s_cmp_lg_u32 s6, 0 -; TONGA-NEXT: s_cselect_b32 s7, s1, s3 -; TONGA-NEXT: s_cselect_b32 s6, s16, s12 -; TONGA-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] -; TONGA-NEXT: s_sub_u32 s6, s6, s10 -; TONGA-NEXT: s_subb_u32 s7, s7, s10 -; TONGA-NEXT: s_cbranch_execnz .LBB12_4 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v11 +; TONGA-NEXT: 
v_add_u32_e32 v9, vcc, v10, v8 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v11, v8, vcc +; TONGA-NEXT: v_xor_b32_e32 v9, v9, v8 +; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8 +; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v9 +; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v8 +; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v9 +; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v8, vcc +; TONGA-NEXT: v_madmk_f32 v11, v18, 0x4f800000, v11 +; TONGA-NEXT: v_rcp_f32_e32 v11, v11 +; TONGA-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; TONGA-NEXT: v_mul_f32_e32 v18, 0x2f800000, v11 +; TONGA-NEXT: v_trunc_f32_e32 v18, v18 +; TONGA-NEXT: v_madmk_f32 v11, v18, 0xcf800000, v11 +; TONGA-NEXT: v_cvt_u32_f32_e32 v22, v18 +; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 +; TONGA-NEXT: v_mul_lo_u32 v20, v23, v22 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v21, v24, v11 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v20 +; TONGA-NEXT: v_add_u32_e32 v21, vcc, v19, v21 +; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0 +; TONGA-NEXT: v_mul_hi_u32 v25, v11, v18 +; TONGA-NEXT: v_add_u32_e32 v25, vcc, v25, v19 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v22, v18, 0 +; TONGA-NEXT: v_addc_u32_e32 v26, vcc, 0, v20, vcc +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v22, v21, 0 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v18 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v26, v19, vcc +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v21, vcc +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v20 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v22, v19, vcc +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v22, v23, v25 +; TONGA-NEXT: v_mul_lo_u32 v23, v24, v11 +; TONGA-NEXT: v_mul_hi_u32 v24, v11, v18 +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v18, 0 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v22, v19 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v23, v19 +; TONGA-NEXT: v_mad_u64_u32 v[22:23], s[0:1], v11, v19, 0 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v19, 0 +; TONGA-NEXT: v_add_u32_e32 v22, vcc, v24, v22 +; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; TONGA-NEXT: v_add_u32_e32 v20, vcc, v22, v20 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v23, v21, vcc +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v20, v18 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v25, v19, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v22 +; TONGA-NEXT: v_xor_b32_e32 v23, v18, v22 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v20, 0 +; TONGA-NEXT: v_mul_hi_u32 v21, v23, v11 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v22, vcc +; TONGA-NEXT: v_xor_b32_e32 v15, v15, v22 +; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v18 +; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v19, vcc +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v11, 0 +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v15, v20, 0 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v24, v18 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v25, v19, vcc +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v20 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; TONGA-NEXT: v_mul_lo_u32 v20, v9, v18 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v11, v8, v11 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v20, v19 +; TONGA-NEXT: 
v_add_u32_e32 v11, vcc, v11, v19 +; TONGA-NEXT: v_sub_u32_e32 v19, vcc, v15, v11 +; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v23, v18 +; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, vcc +; TONGA-NEXT: v_sub_u32_e64 v20, s[0:1], v18, v9 +; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v19, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v23, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v9 +; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v15, v11, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v24, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v8 +; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[2:3] +; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v20, v9 +; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v18, v9 +; TONGA-NEXT: v_subbrev_u32_e64 v19, s[0:1], 0, v19, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23 +; TONGA-NEXT: v_cndmask_b32_e32 v8, v15, v9, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v24, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e32 v9, v18, v20, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v8, v11, v19, vcc +; TONGA-NEXT: v_xor_b32_e32 v9, v9, v22 +; TONGA-NEXT: v_xor_b32_e32 v11, v8, v22 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v9, v22 +; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v11, v22, vcc +; TONGA-NEXT: s_cbranch_execnz .LBB12_3 ; TONGA-NEXT: .LBB12_2: -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, s0 -; TONGA-NEXT: s_sub_i32 s1, 0, s0 +; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v10 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v10 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; TONGA-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_lo_u32 v9, s1, v8 +; TONGA-NEXT: v_mul_lo_u32 v9, v9, v8 ; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; TONGA-NEXT: v_mul_hi_u32 v8, s2, v8 -; TONGA-NEXT: v_mul_lo_u32 v8, v8, s0 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s2, v8 -; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, s0, v8 -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s0, v8 +; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 +; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, s0, v8 -; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s0, v8 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_mov_b32_e32 v9, 0 -; TONGA-NEXT: s_branch .LBB12_5 ; TONGA-NEXT: .LBB12_3: -; TONGA-NEXT: ; implicit-def: $sgpr6_sgpr7 -; TONGA-NEXT: s_branch .LBB12_2 -; TONGA-NEXT: .LBB12_4: -; TONGA-NEXT: v_mov_b32_e32 v9, s7 -; TONGA-NEXT: v_mov_b32_e32 v8, s6 -; TONGA-NEXT: .LBB12_5: ; TONGA-NEXT: v_or_b32_e32 v11, v17, v13 ; TONGA-NEXT: v_mov_b32_e32 v10, 0 ; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; TONGA-NEXT: s_cbranch_vccz .LBB12_15 -; TONGA-NEXT: ; %bb.6: +; TONGA-NEXT: s_cbranch_vccz .LBB12_14 +; TONGA-NEXT: ; %bb.4: ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v13 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, v12, v10 ; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v13, v10, vcc -; TONGA-NEXT: v_xor_b32_e32 v11, v11, v10 -; 
TONGA-NEXT: v_xor_b32_e32 v10, v13, v10 -; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v11 -; TONGA-NEXT: v_cvt_f32_u32_e32 v14, v10 -; TONGA-NEXT: v_sub_u32_e32 v22, vcc, 0, v11 -; TONGA-NEXT: v_subb_u32_e32 v23, vcc, 0, v10, vcc -; TONGA-NEXT: v_madmk_f32 v13, v14, 0x4f800000, v13 -; TONGA-NEXT: v_rcp_f32_e32 v13, v13 -; TONGA-NEXT: v_mul_f32_e32 v13, 0x5f7ffffc, v13 -; TONGA-NEXT: v_mul_f32_e32 v14, 0x2f800000, v13 -; TONGA-NEXT: v_trunc_f32_e32 v14, v14 -; TONGA-NEXT: v_madmk_f32 v13, v14, 0xcf800000, v13 -; TONGA-NEXT: v_cvt_u32_f32_e32 v20, v14 -; TONGA-NEXT: v_cvt_u32_f32_e32 v21, v13 -; TONGA-NEXT: v_mul_lo_u32 v15, v22, v20 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v21, 0 -; TONGA-NEXT: v_mul_lo_u32 v18, v23, v21 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v14, v15 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v18 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v21, v18, 0 -; TONGA-NEXT: v_mul_hi_u32 v19, v21, v13 -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v19, v14 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v20, v13, 0 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v18, 0 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v24, v13 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v15, v14, vcc -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v18 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v13 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v20, v14, vcc -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v24, 0 -; TONGA-NEXT: v_mul_lo_u32 v15, v22, v25 -; TONGA-NEXT: v_mul_lo_u32 v20, v23, v24 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v13, 0 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v15, v14 -; TONGA-NEXT: v_add_u32_e32 v20, vcc, v20, v14 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v24, v20, 0 -; TONGA-NEXT: v_mul_hi_u32 v13, v24, v13 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v20, 0 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v14 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v15, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v18 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v14, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v21, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v20 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v24, v13 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v25, v14, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v19, 31, v17 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v16, v19 -; TONGA-NEXT: v_xor_b32_e32 v20, v13, v19 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v20, v18, 0 -; TONGA-NEXT: v_mul_hi_u32 v21, v20, v15 -; TONGA-NEXT: v_addc_u32_e32 v17, vcc, v17, v19, vcc -; TONGA-NEXT: v_xor_b32_e32 v22, v17, v19 -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v21, v13 -; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v14, vcc -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v15, 0 -; TONGA-NEXT: v_mad_u64_u32 v[17:18], s[0:1], v22, v18, 0 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v21, v13 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v23, v14, vcc -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v13, v17 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc -; TONGA-NEXT: v_mul_lo_u32 v17, v11, v13 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v11, v15, 0 -; TONGA-NEXT: v_mul_lo_u32 v15, v10, v15 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v17, v14 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v15, v14 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, v22, v14 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 
v20, v13 -; TONGA-NEXT: v_subb_u32_e64 v15, s[0:1], v15, v10, vcc -; TONGA-NEXT: v_sub_u32_e64 v17, s[0:1], v13, v11 -; TONGA-NEXT: v_subbrev_u32_e64 v18, s[2:3], 0, v15, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v18, v10 -; TONGA-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v17, v11 +; TONGA-NEXT: v_xor_b32_e32 v15, v11, v10 +; TONGA-NEXT: v_xor_b32_e32 v20, v13, v10 +; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v15 +; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v20 +; TONGA-NEXT: v_sub_u32_e32 v21, vcc, 0, v15 +; TONGA-NEXT: v_subb_u32_e32 v22, vcc, 0, v20, vcc +; TONGA-NEXT: v_madmk_f32 v10, v11, 0x4f800000, v10 +; TONGA-NEXT: v_rcp_f32_e32 v10, v10 +; TONGA-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 +; TONGA-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 +; TONGA-NEXT: v_trunc_f32_e32 v11, v11 +; TONGA-NEXT: v_madmk_f32 v10, v11, 0xcf800000, v10 +; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v11 +; TONGA-NEXT: v_cvt_u32_f32_e32 v19, v10 +; TONGA-NEXT: v_mul_lo_u32 v13, v21, v18 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v21, v19, 0 +; TONGA-NEXT: v_mul_lo_u32 v14, v22, v19 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v13 +; TONGA-NEXT: v_add_u32_e32 v23, vcc, v11, v14 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v19, v23, 0 +; TONGA-NEXT: v_mul_hi_u32 v11, v19, v10 +; TONGA-NEXT: v_add_u32_e32 v24, vcc, v11, v13 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v18, v10, 0 +; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v14, vcc +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v18, v23, 0 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v24, v10 +; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v25, v11, vcc +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v13 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; TONGA-NEXT: v_add_u32_e32 v23, vcc, v19, v10 +; TONGA-NEXT: v_addc_u32_e32 v24, vcc, v18, v11, vcc +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v21, v23, 0 +; TONGA-NEXT: v_mul_lo_u32 v18, v21, v24 +; TONGA-NEXT: v_mul_lo_u32 v19, v22, v23 +; TONGA-NEXT: v_mul_hi_u32 v21, v23, v10 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v24, v10, 0 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v18, v11 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v19, v11 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v24, v11, 0 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v21, v18 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v18, v13 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v19, v14, vcc +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v13, v10 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v23, v10 +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, v24, v11, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v16, v18 +; TONGA-NEXT: v_xor_b32_e32 v19, v10, v18 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v19, v14, 0 +; TONGA-NEXT: v_mul_hi_u32 v21, v19, v13 +; TONGA-NEXT: v_addc_u32_e32 v17, vcc, v17, v18, vcc +; TONGA-NEXT: v_xor_b32_e32 v17, v17, v18 +; TONGA-NEXT: v_add_u32_e32 v21, vcc, v21, v10 +; TONGA-NEXT: v_addc_u32_e32 v22, vcc, 0, v11, vcc +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v13, 0 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v17, v14, 0 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v21, v10 +; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v22, v11, vcc +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v10, v13 +; 
TONGA-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc +; TONGA-NEXT: v_mul_lo_u32 v14, v15, v10 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v15, v13, 0 +; TONGA-NEXT: v_mul_lo_u32 v13, v20, v13 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v14, v11 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v13, v11 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v17, v11 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v19, v10 +; TONGA-NEXT: v_subb_u32_e64 v13, s[0:1], v13, v20, vcc +; TONGA-NEXT: v_sub_u32_e64 v14, s[0:1], v10, v15 +; TONGA-NEXT: v_subbrev_u32_e64 v19, s[2:3], 0, v13, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v20 ; TONGA-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v18, v10 -; TONGA-NEXT: v_subb_u32_e64 v15, s[0:1], v15, v10, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v21, s[0:1], v17, v11 -; TONGA-NEXT: v_subbrev_u32_e64 v15, s[0:1], 0, v15, s[0:1] -; TONGA-NEXT: v_subb_u32_e32 v14, vcc, v22, v14, vcc -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v20 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v14, v10 -; TONGA-NEXT: v_cndmask_b32_e64 v15, v18, v15, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v13, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v14, v10 -; TONGA-NEXT: v_cndmask_b32_e32 v10, v18, v11, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v17, v17, v21, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; TONGA-NEXT: v_cndmask_b32_e32 v11, v13, v17, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v10, v14, v15, vcc -; TONGA-NEXT: v_xor_b32_e32 v11, v11, v19 -; TONGA-NEXT: v_xor_b32_e32 v13, v10, v19 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v11, v19 -; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v13, v19, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_8 -; TONGA-NEXT: .LBB12_7: +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v15 +; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v17, v11, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v22, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v19, v20 +; TONGA-NEXT: v_subb_u32_e64 v13, s[0:1], v13, v20, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v20 +; TONGA-NEXT: v_cndmask_b32_e64 v21, v21, v22, s[2:3] +; TONGA-NEXT: v_sub_u32_e64 v22, s[0:1], v14, v15 +; TONGA-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v10, v15 +; TONGA-NEXT: v_subbrev_u32_e64 v13, s[0:1], 0, v13, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v20 +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v21 +; TONGA-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v22, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; TONGA-NEXT: v_cndmask_b32_e64 v13, v19, v13, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; TONGA-NEXT: v_xor_b32_e32 v10, v10, v18 +; TONGA-NEXT: v_xor_b32_e32 v11, v11, v18 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v10, v18 +; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc +; TONGA-NEXT: s_cbranch_execnz .LBB12_6 +; TONGA-NEXT: .LBB12_5: ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v12 ; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v12 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 @@ -6492,13 +6437,13 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 ; TONGA-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc ; TONGA-NEXT: v_mov_b32_e32 v11, 0 -; TONGA-NEXT: .LBB12_8: +; TONGA-NEXT: .LBB12_6: ; TONGA-NEXT: s_waitcnt 
vmcnt(0) ; TONGA-NEXT: v_or_b32_e32 v13, v5, v1 ; TONGA-NEXT: v_mov_b32_e32 v12, 0 ; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; TONGA-NEXT: s_cbranch_vccz .LBB12_16 -; TONGA-NEXT: ; %bb.9: +; TONGA-NEXT: s_cbranch_vccz .LBB12_15 +; TONGA-NEXT: ; %bb.7: ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, v0, v12 ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc @@ -6604,8 +6549,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v16 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v5, v16 ; TONGA-NEXT: v_subb_u32_e32 v13, vcc, v1, v16, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_11 -; TONGA-NEXT: .LBB12_10: +; TONGA-NEXT: s_cbranch_execnz .LBB12_9 +; TONGA-NEXT: .LBB12_8: ; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v0 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 ; TONGA-NEXT: v_mov_b32_e32 v13, 0 @@ -6624,12 +6569,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v0, v1 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 ; TONGA-NEXT: v_cndmask_b32_e32 v12, v1, v4, vcc -; TONGA-NEXT: .LBB12_11: +; TONGA-NEXT: .LBB12_9: ; TONGA-NEXT: v_or_b32_e32 v1, v7, v3 ; TONGA-NEXT: v_mov_b32_e32 v0, 0 ; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; TONGA-NEXT: s_cbranch_vccz .LBB12_17 -; TONGA-NEXT: ; %bb.12: +; TONGA-NEXT: s_cbranch_vccz .LBB12_16 +; TONGA-NEXT: ; %bb.10: ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v0 ; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc @@ -6735,8 +6680,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v15 ; TONGA-NEXT: v_sub_u32_e32 v14, vcc, v0, v15 ; TONGA-NEXT: v_subb_u32_e32 v15, vcc, v1, v15, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_14 -; TONGA-NEXT: .LBB12_13: +; TONGA-NEXT: s_cbranch_execnz .LBB12_12 +; TONGA-NEXT: .LBB12_11: ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v2 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, 0, v2 ; TONGA-NEXT: v_mov_b32_e32 v15, 0 @@ -6755,7 +6700,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc -; TONGA-NEXT: .LBB12_14: +; TONGA-NEXT: .LBB12_12: ; TONGA-NEXT: v_mov_b32_e32 v0, s4 ; TONGA-NEXT: v_mov_b32_e32 v1, s5 ; TONGA-NEXT: s_add_u32 s0, s4, 16 @@ -6765,13 +6710,16 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; TONGA-NEXT: s_endpgm +; TONGA-NEXT: .LBB12_13: +; TONGA-NEXT: ; implicit-def: $vgpr8_vgpr9 +; TONGA-NEXT: s_branch .LBB12_2 +; TONGA-NEXT: .LBB12_14: +; TONGA-NEXT: s_branch .LBB12_5 ; TONGA-NEXT: .LBB12_15: -; TONGA-NEXT: s_branch .LBB12_7 -; TONGA-NEXT: .LBB12_16: ; TONGA-NEXT: ; implicit-def: $vgpr12_vgpr13 -; TONGA-NEXT: s_branch .LBB12_10 -; TONGA-NEXT: .LBB12_17: -; TONGA-NEXT: s_branch .LBB12_13 +; TONGA-NEXT: s_branch .LBB12_8 +; TONGA-NEXT: .LBB12_16: +; TONGA-NEXT: s_branch .LBB12_11 ; ; EG-LABEL: srem_v4i64: ; EG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index d2008be4fd32a..d86a624695e96 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -51,7 +51,6 @@ define amdgpu_kernel void @foo(ptr addrspace(5) 
%ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: v_mov_b32_e32 v62, s66 ; CHECK-NEXT: v_mov_b32_e32 v63, s67 ; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] -; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] ; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[58:59] @@ -68,7 +67,6 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: flat_store_dwordx2 v[46:47], v[44:45] ; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] ; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)