diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index c3f751c1a9883..b35f9faf024bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4903,16 +4903,40 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, LLT S16 = LLT::scalar(16); LLT S32 = LLT::scalar(32); + // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 + // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 + // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d + // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp + // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n + // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp + // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n + // tmp.u = opx(V_MUL_F32, e32.u, r32.u); + // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000) + // q32.u = opx(V_ADD_F32, tmp.u, q32.u); + // q16.u = opx(V_CVT_F16_F32, q32.u); + // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n) + auto LHSExt = B.buildFPExt(S32, LHS, Flags); auto RHSExt = B.buildFPExt(S32, RHS, Flags); - - auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) + auto NegRHSExt = B.buildFNeg(S32, RHSExt); + auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) .addUse(RHSExt.getReg(0)) .setMIFlags(Flags); - - auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); - auto RDst = B.buildFPTrunc(S16, QUOT, Flags); - + auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags); + MachineInstrBuilder Err; + if (ST.hasMadMacF32Insts()) { + Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags); + Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags); + Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags); + } else { + Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags); + Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags); + Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags); + } + auto Tmp = B.buildFMul(S32, Err, Rcp, Flags); + Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000)); + Quot = B.buildFAdd(S32, Tmp, Quot, Flags); + auto RDst = B.buildFPTrunc(S16, Quot, Flags); B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) .addUse(RDst.getReg(0)) .addUse(RHS) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ccd8822570f63..6172687f4b4ab 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10693,19 +10693,48 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { return FastLowered; SDLoc SL(Op); - SDValue Src0 = Op.getOperand(0); - SDValue Src1 = Op.getOperand(1); - - SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); - SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); - - SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1); - SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1); - - SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32); - SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); - return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0); + // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 + // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 + // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d + // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp + // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 
+ // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp + // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n + // tmp.u = opx(V_MUL_F32, e32.u, r32.u); + // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000) + // q32.u = opx(V_ADD_F32, tmp.u, q32.u); + // q16.u = opx(V_CVT_F16_F32, q32.u); + // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n) + + // We will use ISD::FMA on targets that don't support ISD::FMAD. + unsigned FMADOpCode = + isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA; + + SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); + SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); + SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt); + SDValue Rcp = + DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags()); + SDValue Quot = + DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags()); + SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt, + Op->getFlags()); + Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags()); + Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt, + Op->getFlags()); + SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags()); + SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp); + TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast, + DAG.getConstant(0xff800000, SL, MVT::i32)); + Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast); + Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags()); + SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, + DAG.getConstant(0, SL, MVT::i32)); + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS, + Op->getFlags()); } // Faster 2.5 ULP division that does not support denormals. 
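Editor's note (not part of the patch): the comment block above, shared by the GlobalISel and SelectionDAG paths, describes the new f16 division lowering as a reciprocal estimate followed by two error/refinement steps and a masked correction. The following host-side C++ sketch is illustrative only, under stated stand-ins: 1.0f/x approximates V_RCP_F32, std::fma approximates V_MAD_F32/V_FMA_F32, and the final V_CVT_F16_F32 / V_DIV_FIXUP_F16 special-case handling is omitted. The function name is hypothetical.

#include <cmath>
#include <cstdint>
#include <cstring>

// Approximates the f32 refinement sequence emitted for f16 fdiv.
// Inputs are the f32-extended numerator n (a32) and denominator d (b32).
static float fdiv16RefineSketch(float n, float d) {
  float r = 1.0f / d;            // stand-in for r32 = V_RCP_F32(b32)
  float q = n * r;               // q32 = a32 * r32
  float e = std::fma(-d, q, n);  // e32 = -b32 * q32 + a32
  q = std::fma(e, r, q);         // q32 = e32 * r32 + q32
  e = std::fma(-d, q, n);        // recompute the error with the refined q32
  float t = e * r;               // tmp = e32 * r32
  uint32_t bits;
  std::memcpy(&bits, &t, sizeof bits);
  bits &= 0xff800000u;           // tmp &= 0xff800000: keep sign + exponent only
  std::memcpy(&t, &bits, sizeof bits);
  return t + q;                  // q32 fed to V_CVT_F16_F32 / V_DIV_FIXUP_F16
}

As far as the design choice goes, the 0xff800000 mask clears the mantissa of the scaled error, so the final correction added to the quotient is a signed power of two; this appears intended to quantize the adjustment before the result is truncated back to f16 and run through the div_fixup touch-up.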
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 1a98285230b2c..5ba036c386a40 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -57,24 +57,59 @@ define half @v_fdiv_f16(half %a, half %b) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_fdiv_f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_fdiv_f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v3 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_fdiv_f16: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -83,27 +118,71 @@ define half @v_fdiv_f16(half %a, half %b) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 
op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half %a, %b @@ -188,24 +267,59 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_f16_ulp25: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, 
v0 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_fdiv_f16_ulp25: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_fdiv_f16_ulp25: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v3 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -214,27 +328,71 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f16_ulp25: -; 
GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_f16_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_f16_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f16_ulp25: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half %a, %b @@ -670,44 +828,113 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX8-NEXT: v_rcp_f32_e32 v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5 -; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_fdiv_v2f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v2, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v7, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX8-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v4, v6, v3 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_fdiv_v2f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v2, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v10, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v9, v4 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v8 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v8, v5, v7 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v9, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v8, v5, v7 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v6, v3 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_fdiv_v2f16: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-IEEE-NEXT: 
v_cvt_f32_f16_e32 v6, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5 -; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v2, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v8 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v9, -v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v9, v9, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX9-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v4, v6, v3 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -715,33 +942,103 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5 -; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v7, v7 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v8, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v6, v7 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v7 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, 
v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v5, v3 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_v2f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v9, v5 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v8, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v9, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v12, -v3, v10 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v13, -v4, v11 +; GFX10-IEEE-NEXT: v_add_f32_e32 v12, v12, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v13, v13, v9 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v12, v12, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v13, v13, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v12, v10 +; GFX10-IEEE-NEXT: v_add_f32_e32 v11, v13, v11 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v10 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v11 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v10 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v9, v5 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v7, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v10, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v11, v9, v7 +; GFX10-FLUSH-NEXT: v_mad_f32 v12, -v3, v10, v8 +; GFX10-FLUSH-NEXT: v_mad_f32 v13, -v4, v11, v9 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v10, v12, v6 +; 
GFX10-FLUSH-NEXT: v_mac_f32_e32 v11, v13, v7 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v10, v8 +; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v4, v11, v9 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v10 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16: ; GFX11: ; %bb.0: @@ -749,12 +1046,24 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4 +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4 +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -897,44 +1206,113 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_v2f16_ulp25: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX8-NEXT: v_rcp_f32_e32 v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5 -; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_fdiv_v2f16_ulp25: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-IEEE-NEXT: 
v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v2, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v7, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX8-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v4, v6, v3 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_fdiv_v2f16_ulp25: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v2, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v10, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v9, v4 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v8 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v8, v5, v7 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v9, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v8, v5, v7 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v6, v3 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_fdiv_v2f16_ulp25: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5 -; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2 
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v2, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v8 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v9, -v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v9, v9, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX9-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v4, v6, v3 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -942,33 +1320,103 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5 -; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v7, v7 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v8, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v6, v7 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v7 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v5, v3 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_v2f16_ulp25: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_fdiv_v2f16_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v9, v5 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v8, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v9, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v12, -v3, v10 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v13, -v4, v11 +; GFX10-IEEE-NEXT: v_add_f32_e32 v12, v12, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v13, v13, v9 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v12, v12, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v13, v13, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v12, v10 +; GFX10-IEEE-NEXT: v_add_f32_e32 v11, v13, v11 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v10 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v11 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v9 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v10 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f16_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v9, v5 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v7, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v10, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v11, v9, v7 +; GFX10-FLUSH-NEXT: v_mad_f32 v12, -v3, v10, v8 +; GFX10-FLUSH-NEXT: v_mad_f32 v13, -v4, v11, v9 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v10, v12, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v11, v13, v7 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v10, v8 +; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v4, v11, v9 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v10 +; 
GFX10-FLUSH-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_ulp25: ; GFX11: ; %bb.0: @@ -976,12 +1424,24 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4 +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4 +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1061,36 +1521,103 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_rcp_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_rcp_v2f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, 
v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_rcp_v2f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_rcp_v2f16: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; 
GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1104,43 +1631,122 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rcp_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_rcp_v2f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 
0xff800000, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_v2f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x @@ -1218,36 +1824,103 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_neg_rcp_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_neg_rcp_v2f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; 
GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_neg_rcp_v2f16: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1261,43 +1934,122 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_neg_rcp_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: 
v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_neg_rcp_v2f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_neg_rcp_v2f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_neg_rcp_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; 
GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x @@ -1385,38 +2137,106 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_rcp_v2f16_fabs: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_rcp_v2f16_fabs: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_rcp_v2f16_fabs: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_rcp_v2f16_fabs: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1427,50 +2247,131 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-FLUSH-LABEL: 
v_rcp_v2f16_fabs: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0 -; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, 1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rcp_v2f16_fabs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_rcp_v2f16_fabs: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; 
GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_v2f16_fabs: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4 +; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4 +; GFX11-NEXT: 
v_and_b32_e32 v0, 0xff800000, v0 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fdiv = fdiv <2 x half> , %x.fabs @@ -1558,38 +2459,106 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_neg_rcp_v2f16_fabs: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_neg_rcp_v2f16_fabs: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16_fabs: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: 
v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_neg_rcp_v2f16_fabs: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1600,50 +2569,131 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16_fabs: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0 -; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 
0x7fff7fff, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, -1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_neg_rcp_v2f16_fabs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_neg_rcp_v2f16_fabs: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0 +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX10-IEEE-NEXT: 
v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_neg_rcp_v2f16_fabs: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_neg_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4 +; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fdiv = fdiv <2 x half> , %x.fabs @@ -1881,36 +2931,103 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; 
GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_rcp_v2f16_ulp25: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_rcp_v2f16_ulp25: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-FLUSH-NEXT: 
v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_rcp_v2f16_ulp25: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1924,43 +3041,122 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rcp_v2f16_ulp25: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: 
v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_rcp_v2f16_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_v2f16_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_ulp25: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x @@ -2251,24 +3447,60 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) { ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-FLUSH-NEXT: ; return to shader part epilog ; -; GFX8-LABEL: s_fdiv_f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX8-NEXT: v_rcp_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_div_fixup_f16 v0, v0, v1, s0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-IEEE-LABEL: s_fdiv_f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v2, v0 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v1, v2 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v4, -v0, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v4, v4, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX8-IEEE-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-IEEE-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v0, v1, s0 +; GFX8-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-IEEE-NEXT: ; return to shader part epilog +; +; GFX8-FLUSH-LABEL: s_fdiv_f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v2, v0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v1, v2 +; GFX8-FLUSH-NEXT: v_mad_f32 v4, -v0, v3, v1 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v2 +; GFX8-FLUSH-NEXT: v_mad_f32 v0, -v0, v3, v1 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v1, s0 +; GFX8-FLUSH-NEXT: v_readfirstlane_b32 
s0, v0 +; GFX8-FLUSH-NEXT: ; return to shader part epilog ; ; GFX9-IEEE-LABEL: s_fdiv_f16: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v0 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v1, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v4, -v0, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v4, v4, v2 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX9-IEEE-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-IEEE-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v1, s0 @@ -2278,28 +3510,72 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) { ; GFX9-FLUSH-LABEL: s_fdiv_f16: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v1, s0 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v0 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, -v2, v1, s0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v1, v3, v0 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, -v2, v1, s0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-FLUSH-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_fdiv_f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX10-NEXT: v_rcp_f32_e32 v0, v0 -; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-IEEE-LABEL: s_fdiv_f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, s0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v0, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v1 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v0, s1, s0 +; GFX10-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-IEEE-NEXT: ; return to shader part epilog +; +; GFX10-FLUSH-LABEL: s_fdiv_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v1, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v0, v3, v2 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v1 +; GFX10-FLUSH-NEXT: v_mad_f32 v0, -v0, v3, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; 
GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v0, s1, s0 +; GFX10-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-FLUSH-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fdiv_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0 +; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0 +; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog @@ -2499,42 +3775,113 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-FLUSH-NEXT: ; return to shader part epilog ; -; GFX8-LABEL: s_fdiv_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX8-NEXT: v_rcp_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, s2 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_div_fixup_f16 v0, v0, v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_div_fixup_f16 v1, v1, v2, s2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-IEEE-LABEL: s_fdiv_v2f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX8-IEEE-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s3 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v2, v0 +; GFX8-IEEE-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v1, v2 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v0, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX8-IEEE-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v3, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v5, -v4, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v1 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-IEEE-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v0, 
v2, s0 +; GFX8-IEEE-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v1, v2, s2 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-IEEE-NEXT: ; return to shader part epilog +; +; GFX8-FLUSH-LABEL: s_fdiv_v2f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX8-FLUSH-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s3 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v2, v0 +; GFX8-FLUSH-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v1, v2 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v0, v5, v1 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX8-FLUSH-NEXT: v_mad_f32 v0, -v0, v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v1, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v3, v1 +; GFX8-FLUSH-NEXT: v_mad_f32 v5, -v4, v2, v3 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v2, v5, v1 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-FLUSH-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0 +; GFX8-FLUSH-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s2 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-FLUSH-NEXT: ; return to shader part epilog ; ; GFX9-IEEE-LABEL: s_fdiv_v2f16: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 ; GFX9-IEEE-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s3 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-IEEE-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, s2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v1, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v0, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v2 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX9-IEEE-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v5, -v4, v2 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v1 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v2 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v2, s0 @@ -2547,36 +3894,106 @@ 
define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX9-FLUSH-LABEL: s_fdiv_v2f16: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, s3 +; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v0 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v2, v1, s0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v1, v4, v0 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v2, v1, s0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v4, s2 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v5, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v4, s2 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, s2 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-FLUSH-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_fdiv_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: v_rcp_f32_e32 v0, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0 -; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-IEEE-LABEL: s_fdiv_v2f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_lshr_b32 s2, s1, 16 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX10-IEEE-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v5, s3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v5, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v8, -v0, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v1, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v8, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; 
GFX10-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX10-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v0, s1, s0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v1, s2, s3 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-IEEE-NEXT: ; return to shader part epilog +; +; GFX10-FLUSH-LABEL: s_fdiv_v2f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_lshr_b32 s2, s1, 16 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX10-FLUSH-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, s3 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v5, v3 +; GFX10-FLUSH-NEXT: v_mad_f32 v8, -v0, v6, v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v1, v7, v5 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v6, v8, v2 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v3 +; GFX10-FLUSH-NEXT: v_mad_f32 v0, -v0, v6, v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v5 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v0, s1, s0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, s3 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-FLUSH-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fdiv_v2f16: ; GFX11: ; %bb.0: @@ -2584,13 +4001,25 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s3 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0 +; GFX11-NEXT: v_dual_mul_f32 v2, v2, v0 :: v_dual_mul_f32 v3, v3, v1 +; GFX11-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v5, -s2, v3, s3 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v2, v4, v0 :: v_dual_fmac_f32 v3, v5, v1 +; GFX11-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v5, -s2, v3, s3 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xff800000, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3 +; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog @@ -2896,26 +4325,77 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-FLUSH-NEXT: ; return to shader part epilog ; -; GFX8-LABEL: s_rsq_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_sqrt_f16_e32 v0, s0 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_sqrt_f16_e32 v1, s0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-IEEE-LABEL: s_rsq_v2f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 +; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-IEEE-NEXT: ; return to shader part epilog +; +; GFX8-FLUSH-LABEL: s_rsq_v2f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 +; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-FLUSH-NEXT: ; return to shader part epilog ; ; GFX9-IEEE-LABEL: s_rsq_v2f16: ; GFX9-IEEE: ; %bb.0: @@ -2925,11 +4405,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 +; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 @@ -2942,50 +4434,125 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-FLUSH-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_rsq_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s1, s0, 16 -; GFX10-NEXT: v_sqrt_f16_e32 v0, s0 -; GFX10-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-IEEE-LABEL: s_rsq_v2f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16 +; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 +; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-IEEE-NEXT: ; return to shader part epilog +; +; GFX10-FLUSH-LABEL: s_rsq_v2f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 +; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 +; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: 
v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-FLUSH-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_rsq_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog @@ -3876,25 +5443,75 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_rsq_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sqrt_f16_e32 v1, v0 -; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_rsq_v2f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5 +; 
GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_rsq_v2f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_rsq_v2f16: ; GFX9-IEEE: ; %bb.0: @@ -3904,10 +5521,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7 +; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; 
GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -3920,38 +5549,100 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rsq_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_sqrt_f16_e32 v1, v0 -; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_rsq_v2f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; 
GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rsq_v2f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rsq_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -3959,10 +5650,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) @@ -4054,25 +5757,75 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; 
GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_neg_rsq_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sqrt_f16_e32 v1, v0 -; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_neg_rsq_v2f16: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_neg_rsq_v2f16: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 
0xff800000, v2 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_neg_rsq_v2f16: ; GFX9-IEEE: ; %bb.0: @@ -4082,10 +5835,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7 +; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -4098,38 +5863,100 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_neg_rsq_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_sqrt_f16_e32 v1, v0 -; GFX10-NEXT: v_sqrt_f16_sdwa 
v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-IEEE-LABEL: v_neg_rsq_v2f16: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_neg_rsq_v2f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 +; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_neg_rsq_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -4137,10 +5964,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) @@ -4154,9 +5993,5 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) declare <2 x half> @llvm.sqrt.v2f16(<2 x half>) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10-FLUSH: {{.*}} -; GFX10-IEEE: {{.*}} ; GFX11-FLUSH: {{.*}} ; GFX11-IEEE: {{.*}} -; GFX8-FLUSH: {{.*}} -; GFX8-IEEE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index e051cc28469fa..8409e9c88aada 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -46,8 +46,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_rcp_f32_e32 v2, v2 -; VI-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: v_mul_f32_e32 v4, v0, v3 +; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 +; VI-NEXT: v_mac_f32_e32 v4, v5, v3 +; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 +; VI-NEXT: v_mul_f32_e32 v0, v0, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; VI-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 ; VI-NEXT: v_trunc_f16_e32 v0, v0 @@ -554,19 +560,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0 ; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_rcp_f32_e32 v2, v2 ; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_rcp_f32_e32 v3, v3 -; VI-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: v_mul_f32_e32 v4, v0, v3 +; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 +; VI-NEXT: v_mac_f32_e32 v4, v5, v3 +; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 +; VI-NEXT: v_mul_f32_e32 v0, v0, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; 
VI-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 ; VI-NEXT: v_trunc_f16_e32 v0, v0 ; VI-NEXT: v_fma_f16 v0, -v0, v1, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; VI-NEXT: v_mul_f32_e32 v1, v1, v3 +; VI-NEXT: v_rcp_f32_e32 v4, v3 +; VI-NEXT: v_mul_f32_e32 v5, v1, v4 +; VI-NEXT: v_mad_f32 v6, -v3, v5, v1 +; VI-NEXT: v_mac_f32_e32 v5, v6, v4 +; VI-NEXT: v_mad_f32 v1, -v3, v5, v1 +; VI-NEXT: v_mul_f32_e32 v1, v1, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; VI-NEXT: v_add_f32_e32 v1, v1, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1 ; VI-NEXT: v_trunc_f16_e32 v1, v1 @@ -691,41 +709,65 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0 ; VI-NEXT: s_lshr_b32 s8, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s8 ; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_rcp_f32_e32 v2, v2 ; VI-NEXT: s_lshr_b32 s6, s2, 16 -; VI-NEXT: v_rcp_f32_e32 v3, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 -; VI-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-NEXT: v_rcp_f32_e32 v3, v2 +; VI-NEXT: s_lshr_b32 s9, s1, 16 +; VI-NEXT: s_lshr_b32 s7, s3, 16 +; VI-NEXT: v_mul_f32_e32 v4, v0, v3 +; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 +; VI-NEXT: v_mac_f32_e32 v4, v5, v3 +; VI-NEXT: v_mad_f32 v0, -v2, v4, v0 +; VI-NEXT: v_mul_f32_e32 v0, v0, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; VI-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v3, s8 ; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_rcp_f32_e32 v4, v4 -; VI-NEXT: s_lshr_b32 s9, s1, 16 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 ; VI-NEXT: v_trunc_f16_e32 v0, v0 ; VI-NEXT: v_fma_f16 v0, -v0, v1, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s6 -; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; VI-NEXT: s_lshr_b32 s7, s3, 16 -; VI-NEXT: v_mul_f32_e32 v1, v1, v3 +; VI-NEXT: v_rcp_f32_e32 v4, v3 +; VI-NEXT: v_mul_f32_e32 v5, v1, v4 +; VI-NEXT: v_mad_f32 v6, -v3, v5, v1 +; VI-NEXT: v_mac_f32_e32 v5, v6, v4 +; VI-NEXT: v_mad_f32 v1, -v3, v5, v1 +; VI-NEXT: v_mul_f32_e32 v1, v1, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; VI-NEXT: v_add_f32_e32 v1, v1, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_rcp_f32_e32 v5, v5 ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6 ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_rcp_f32_e32 v5, v4 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mul_f32_e32 v2, v2, v4 +; VI-NEXT: v_mul_f32_e32 v6, v2, v5 +; VI-NEXT: v_mad_f32 v7, -v4, v6, v2 +; VI-NEXT: v_mac_f32_e32 v6, v7, v5 +; VI-NEXT: v_mad_f32 v2, -v4, v6, v2 +; VI-NEXT: v_mul_f32_e32 v2, v2, v5 +; VI-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; VI-NEXT: v_add_f32_e32 v2, v2, v6 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 ; VI-NEXT: v_mov_b32_e32 v4, s9 ; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3 ; VI-NEXT: v_trunc_f16_e32 v2, v2 ; VI-NEXT: v_fma_f16 v2, -v2, v3, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s7 -; VI-NEXT: v_mul_f32_e32 v3, v3, v5 +; VI-NEXT: v_rcp_f32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v7, v3, v6 +; VI-NEXT: v_mad_f32 v8, -v5, v7, v3 +; VI-NEXT: v_mac_f32_e32 v7, v8, v6 +; VI-NEXT: v_mad_f32 v3, -v5, v7, v3 +; VI-NEXT: v_mul_f32_e32 v3, v3, v6 +; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; 
VI-NEXT: v_add_f32_e32 v3, v3, v7 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7 ; VI-NEXT: v_trunc_f16_e32 v3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir index e774c2c83dfd8..1f9c059c2ac60 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir @@ -44,6 +44,7 @@ body: | ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32) ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; VI-LABEL: name: test_fdiv_s16 ; VI: liveins: $vgpr0, $vgpr1 ; VI-NEXT: {{ $}} @@ -53,12 +54,24 @@ body: | ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) + ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C]] + ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16) ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX9-LABEL: name: test_fdiv_s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -68,12 +81,24 @@ body: | ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) + ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C]] + ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16) ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16 ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -85,21 +110,6 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]] ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-LABEL: name: test_fdiv_s16 - ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16) - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -141,6 +151,7 @@ body: | ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; VI-LABEL: name: test_fdiv_s32_denorms_on ; VI: liveins: $vgpr0, $vgpr1 ; VI-NEXT: {{ $}} @@ -160,6 +171,7 @@ body: | ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-LABEL: name: test_fdiv_s32_denorms_on ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -179,6 +191,7 @@ body: | ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_on ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -187,6 +200,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]] ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; ; GFX10-LABEL: name: test_fdiv_s32_denorms_on ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -246,6 +260,7 @@ body: | ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; SI-NEXT: $vgpr0 
= COPY [[INT6]](s32) + ; ; VI-LABEL: name: test_fdiv_s32_denorms_off ; VI: liveins: $vgpr0, $vgpr1 ; VI-NEXT: {{ $}} @@ -267,6 +282,7 @@ body: | ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-LABEL: name: test_fdiv_s32_denorms_off ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -288,6 +304,7 @@ body: | ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -296,6 +313,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]] ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; ; GFX10-LABEL: name: test_fdiv_s32_denorms_off ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -357,6 +375,7 @@ body: | ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp ; VI: liveins: $vgpr0, $vgpr1 ; VI-NEXT: {{ $}} @@ -378,6 +397,7 @@ body: | ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -399,6 +419,7 @@ body: | ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -407,6 +428,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; ; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -473,6 +495,7 @@ body: | ; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1) ; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; VI-LABEL: name: test_fdiv_s64 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; 
VI-NEXT: {{ $}} @@ -492,6 +515,7 @@ body: | ; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; GFX9-LABEL: name: test_fdiv_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -511,6 +535,7 @@ body: | ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64 ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -527,6 +552,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]] ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]] ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64) + ; ; GFX10-LABEL: name: test_fdiv_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -603,6 +629,7 @@ body: | ; SI-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; VI-LABEL: name: test_fdiv_v2s32 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} @@ -641,6 +668,7 @@ body: | ; VI-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; GFX9-LABEL: name: test_fdiv_v2s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -679,6 +707,7 @@ body: | ; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32 ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -692,6 +721,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[INT1]] ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; GFX10-LABEL: name: test_fdiv_v2s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -776,6 +806,7 @@ body: | ; SI-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; VI-LABEL: name: test_fdiv_v2s32_flags ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} @@ -810,6 +841,7 @@ body: | ; VI-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x 
s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; GFX9-LABEL: name: test_fdiv_v2s32_flags ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -844,6 +876,7 @@ body: | ; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32_flags ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -857,6 +890,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[UV1]], [[INT1]] ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; GFX10-LABEL: name: test_fdiv_v2s32_flags ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -949,6 +983,7 @@ body: | ; SI-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32) ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32) ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; ; VI-LABEL: name: test_fdiv_v3s32 ; VI: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; VI-NEXT: {{ $}} @@ -995,6 +1030,7 @@ body: | ; VI-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32) ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32) ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; ; GFX9-LABEL: name: test_fdiv_v3s32 ; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} @@ -1041,6 +1077,7 @@ body: | ; GFX9-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s32 ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -1056,6 +1093,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[INT2]] ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32) ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; ; GFX10-LABEL: name: test_fdiv_v3s32 ; GFX10: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -1162,6 +1200,7 @@ body: | ; SI-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64) ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64) ; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; ; VI-LABEL: name: test_fdiv_v2s64 ; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: {{ $}} @@ -1196,6 +1235,7 @@ body: | ; VI-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64) ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64) 
; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; ; GFX9-LABEL: name: test_fdiv_v2s64 ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: {{ $}} @@ -1230,6 +1270,7 @@ body: | ; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s64 ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -1258,6 +1299,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]] ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64) ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; ; GFX10-LABEL: name: test_fdiv_v2s64 ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX10-NEXT: {{ $}} @@ -1355,6 +1397,7 @@ body: | ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; SI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; ; VI-LABEL: name: test_fdiv_v2s16 ; VI: liveins: $vgpr0, $vgpr1 ; VI-NEXT: {{ $}} @@ -1371,15 +1414,36 @@ body: | ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) + ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16) ; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) + ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) - ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) + ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL 
[[FNEG1]], [[FADD5]] + ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16) ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT1]](s16) ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT3]](s16) @@ -1387,6 +1451,7 @@ body: | ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; VI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; ; GFX9-LABEL: name: test_fdiv_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1403,18 +1468,40 @@ body: | ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) + ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16) ; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) + ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) - ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) + ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]] + ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s16 ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -1435,34 +1522,6 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[INT1]] ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL]](s16), [[FMUL1]](s16) ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) - ; GFX10-LABEL: name: test_fdiv_v2s16 - ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16) - ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) - ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) - ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_FDIV %0, %1 @@ -1546,6 +1605,7 @@ body: | ; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC2]](s16) ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) ; SI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; ; VI-LABEL: name: test_fdiv_v3s16 ; VI: liveins: $vgpr0, $vgpr1 ; VI-NEXT: {{ $}} @@ -1568,27 +1628,59 @@ body: | ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) + ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; VI-NEXT: 
[[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16) ; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) + ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) - ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) + ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]] + ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16) ; VI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; VI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) + ; VI-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]] ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) - ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] - ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32) + ; VI-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] + ; VI-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]] + ; VI-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]] + ; VI-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]] + ; VI-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]] + ; VI-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]] + ; VI-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]] + ; VI-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]] + ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]] + ; VI-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]] + ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32) ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16) ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = 
G_ANYEXT [[INT1]](s16) ; VI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) ; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) ; VI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; ; GFX9-LABEL: name: test_fdiv_v3s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1611,27 +1703,59 @@ body: | ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) + ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16) ; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) + ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) - ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) + ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]] + ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; GFX9-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) + ; GFX9-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]] ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) - ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] - ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC 
[[FMUL2]](s32) + ; GFX9-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] + ; GFX9-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]] + ; GFX9-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]] + ; GFX9-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]] + ; GFX9-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]] + ; GFX9-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]] + ; GFX9-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]] + ; GFX9-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]] + ; GFX9-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]] + ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32) ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16) ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s16 ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -1663,49 +1787,6 @@ body: | ; GFX9-UNSAFE-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL2]](s16) ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) ; GFX9-UNSAFE-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-LABEL: name: test_fdiv_v3s16 - ; GFX10: liveins: $vgpr0, $vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), 
[[TRUNC3]](s16), [[TRUNC]](s16) - ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) - ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) - ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) - ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) - ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) - ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] - ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32) - ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16) - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) - ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) - ; GFX10-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_FDIV %0, %1 @@ -1816,6 +1897,7 @@ body: | ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; ; VI-LABEL: name: test_fdiv_v4s16 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} @@ -1842,27 +1924,68 @@ body: | ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) + ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16) ; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) + ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), 
[[FPEXT3]](s32) - ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) + ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]] + ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16) ; VI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; VI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16) + ; VI-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]] ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) - ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] - ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32) + ; VI-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] + ; VI-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]] + ; VI-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]] + ; VI-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]] + ; VI-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]] + ; VI-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]] + ; VI-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]] + ; VI-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]] + ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]] + ; VI-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]] + ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32) ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16) ; VI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) ; VI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16) + ; VI-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]] ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32) - ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] - ; VI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32) + ; VI-NEXT: [[FMUL15:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] + ; VI-NEXT: [[FMUL16:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FMUL15]] + ; VI-NEXT: [[FADD12:%[0-9]+]]:_(s32) = G_FADD [[FMUL16]], [[FPEXT6]] + ; VI-NEXT: [[FMUL17:%[0-9]+]]:_(s32) = G_FMUL [[FADD12]], [[INT6]] + ; VI-NEXT: [[FADD13:%[0-9]+]]:_(s32) = G_FADD [[FMUL17]], [[FMUL15]] + ; VI-NEXT: [[FMUL18:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FADD13]] + ; VI-NEXT: [[FADD14:%[0-9]+]]:_(s32) = G_FADD [[FMUL18]], [[FPEXT6]] + ; VI-NEXT: [[FMUL19:%[0-9]+]]:_(s32) = G_FMUL [[FADD14]], [[INT6]] + ; VI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[FMUL19]], [[C1]] + ; VI-NEXT: [[FADD15:%[0-9]+]]:_(s32) = G_FADD [[AND3]], [[FADD13]] + ; VI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD15]](s32) ; VI-NEXT: [[INT7:%[0-9]+]]:_(s16) = 
G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16) ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT1]](s16) ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT3]](s16) @@ -1876,6 +1999,7 @@ body: | ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; ; GFX9-LABEL: name: test_fdiv_v4s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -1902,32 +2026,74 @@ body: | ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) + ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16) ; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) ; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) + ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) - ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) + ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]] + ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) ; GFX9-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16) + ; GFX9-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]] ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = 
G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) - ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] - ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32) + ; GFX9-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] + ; GFX9-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]] + ; GFX9-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]] + ; GFX9-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]] + ; GFX9-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]] + ; GFX9-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]] + ; GFX9-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]] + ; GFX9-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]] + ; GFX9-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]] + ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32) ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16) ; GFX9-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) ; GFX9-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16) + ; GFX9-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]] ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32) - ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] - ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32) + ; GFX9-NEXT: [[FMUL15:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] + ; GFX9-NEXT: [[FMUL16:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FMUL15]] + ; GFX9-NEXT: [[FADD12:%[0-9]+]]:_(s32) = G_FADD [[FMUL16]], [[FPEXT6]] + ; GFX9-NEXT: [[FMUL17:%[0-9]+]]:_(s32) = G_FMUL [[FADD12]], [[INT6]] + ; GFX9-NEXT: [[FADD13:%[0-9]+]]:_(s32) = G_FADD [[FMUL17]], [[FMUL15]] + ; GFX9-NEXT: [[FMUL18:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FADD13]] + ; GFX9-NEXT: [[FADD14:%[0-9]+]]:_(s32) = G_FADD [[FMUL18]], [[FPEXT6]] + ; GFX9-NEXT: [[FMUL19:%[0-9]+]]:_(s32) = G_FMUL [[FADD14]], [[INT6]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[FMUL19]], [[C1]] + ; GFX9-NEXT: [[FADD15:%[0-9]+]]:_(s32) = G_FADD [[AND3]], [[FADD13]] + ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD15]](s32) ; GFX9-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_v4s16 ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -1964,58 +2130,6 @@ body: | ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL2]](s16), [[FMUL3]](s16) ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) - ; GFX10-LABEL: name: test_fdiv_v4s16 - ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY 
$vgpr2_vgpr3 - ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; GFX10-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX10-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) - ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16) - ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) - ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) - ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) - ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] - ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) - ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) - ; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16) - ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) - ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] - ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32) - ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16) - ; GFX10-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) - ; GFX10-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16) - ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32) - ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] - ; GFX10-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32) - ; GFX10-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) 
= G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<4 x s16>) = G_FDIV %0, %1 @@ -2052,6 +2166,7 @@ body: | ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32) ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; VI-LABEL: name: test_fdiv_s16_constant_one_rcp ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} @@ -2060,6 +2175,7 @@ body: | ; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -2068,6 +2184,7 @@ body: | ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX9-UNSAFE: liveins: $vgpr0 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -2076,6 +2193,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -2122,6 +2240,7 @@ body: | ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32) ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} @@ -2131,6 +2250,7 @@ body: | ; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -2140,6 +2260,7 @@ body: | ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX9-UNSAFE: liveins: $vgpr0 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -2149,6 +2270,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -2190,6 +2312,7 @@ body: | ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; VI-LABEL: name: test_fdiv_s32_constant_one_rcp ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} @@ 
-2208,6 +2331,7 @@ body: | ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-LABEL: name: test_fdiv_s32_constant_one_rcp ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -2226,12 +2350,14 @@ body: | ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_one_rcp ; GFX9-UNSAFE: liveins: $vgpr0 ; GFX9-UNSAFE-NEXT: {{ $}} ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s32) ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32) + ; ; GFX10-LABEL: name: test_fdiv_s32_constant_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -2281,6 +2407,7 @@ body: | ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; VI-LABEL: name: test_fdiv_s32_constant_negative_one_rcp ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} @@ -2300,6 +2427,7 @@ body: | ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-LABEL: name: test_fdiv_s32_constant_negative_one_rcp ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -2319,6 +2447,7 @@ body: | ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_negative_one_rcp ; GFX9-UNSAFE: liveins: $vgpr0 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -2326,6 +2455,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]] ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s32) ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32) + ; ; GFX10-LABEL: name: test_fdiv_s32_constant_negative_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -2389,6 +2519,7 @@ body: | ; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1) ; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; VI-LABEL: name: test_fdiv_s64_constant_one_rcp ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} @@ -2407,6 +2538,7 @@ body: | ; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = 
G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; GFX9-LABEL: name: test_fdiv_s64_constant_one_rcp ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -2425,6 +2557,7 @@ body: | ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -2440,6 +2573,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]] ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]] ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64) + ; ; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -2503,6 +2637,7 @@ body: | ; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1) ; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; VI-LABEL: name: test_fdiv_s64_constant_negative_one_rcp ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} @@ -2522,6 +2657,7 @@ body: | ; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; GFX9-LABEL: name: test_fdiv_s64_constant_negative_one_rcp ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -2541,6 +2677,7 @@ body: | ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -2557,6 +2694,7 @@ body: | ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]] ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]] ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64) + ; ; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 7c89efd0a713c..0c6805e3eba59 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -60,15 +60,21 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX8-NEXT: v_rcp_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v0 +; GFX8-NEXT: v_rcp_f32_e32 v3, v0 +; GFX8-NEXT: v_mul_f32_e32 v7, v1, v3 +; GFX8-NEXT: v_mad_f32 v8, 
-v0, v7, v1 +; GFX8-NEXT: v_mac_f32_e32 v7, v8, v3 +; GFX8-NEXT: v_mad_f32 v0, -v0, v7, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: v_div_fixup_f16 v2, v6, v2, v5 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX8-NEXT: v_div_fixup_f16 v2, v3, v2, v5 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -82,9 +88,17 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mac_f32_e32 v4, v5, v3 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, v1 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -100,9 +114,17 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX10-NEXT: v_mad_f32 v7, -v3, v6, v5 +; GFX10-NEXT: v_mac_f32_e32 v6, v7, v4 +; GFX10-NEXT: v_mad_f32 v3, -v3, v6, v5 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm @@ -120,11 +142,23 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index 301299daaa61f..2eb35977b8160 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1444,12 +1444,19 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 -; VI-NEXT: s_movk_i32 s4, 0x7000 +; VI-NEXT: s_mov_b32 s4, 0x46000000 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; VI-NEXT: v_rcp_f32_e32 v1, v1 -; VI-NEXT: v_mul_f32_e32 v1, 0x46000000, v1 +; VI-NEXT: v_rcp_f32_e32 v2, v1 +; VI-NEXT: v_mul_f32_e32 v3, 0x46000000, v2 +; VI-NEXT: v_mad_f32 v4, -v1, v3, s4 +; VI-NEXT: v_mac_f32_e32 v3, v4, v2 +; VI-NEXT: v_mad_f32 v1, -v1, v3, s4 +; VI-NEXT: v_mul_f32_e32 v1, v1, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; VI-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VI-NEXT: s_movk_i32 s4, 0x7000 ; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1457,12 +1464,18 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1 -; GFX10-NEXT: s_mov_b32 s4, 0x46000000 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_fma_mixlo_f16 v1, v1, s4, 0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x46000000, v2 +; GFX10-NEXT: v_mad_f32 v4, -v1, v3, 0x46000000 +; GFX10-NEXT: v_mac_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mad_f32 v1, -v1, v3, 0x46000000 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1478,8 +1491,18 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v1, v1, s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v2, 0x46000000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000 ; GFX11-NEXT: s_setpc_b64 s[30:31] %shl = shl nuw i32 1, %cnt @@ -1551,8 +1574,14 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1 ; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; VI-NEXT: v_rcp_f32_e32 v1, v1 -; VI-NEXT: v_add_f32_e32 v1, v1, v1 +; VI-NEXT: v_rcp_f32_e32 v2, 
v1 +; VI-NEXT: v_add_f32_e32 v3, v2, v2 +; VI-NEXT: v_mad_f32 v4, -v1, v3, 2.0 +; VI-NEXT: v_mac_f32_e32 v3, v4, v2 +; VI-NEXT: v_mad_f32 v1, -v1, v3, 2.0 +; VI-NEXT: v_mul_f32_e32 v1, v1, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; VI-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; VI-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1563,8 +1592,14 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1 ; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_mad_f32 v4, -v1, v3, 2.0 +; GFX10-NEXT: v_mac_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mad_f32 v1, -v1, v3, 2.0 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1573,13 +1608,23 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1 +; GFX11-NEXT: s_mov_b32 s0, 2.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX11-NEXT: v_add_f32_e32 v2, v1, v1 +; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 7c5d73ab66b47..b3432c457d9a4 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -109,8 +109,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; VI-NEXT: v_rcp_f32_e32 v5, v5 -; VI-NEXT: v_mul_f32_e32 v3, v3, v5 +; VI-NEXT: v_rcp_f32_e32 v6, v5 +; VI-NEXT: v_mul_f32_e32 v7, v3, v6 +; VI-NEXT: v_mad_f32 v8, -v5, v7, v3 +; VI-NEXT: v_mac_f32_e32 v7, v8, v6 +; VI-NEXT: v_mad_f32 v3, -v5, v7, v3 +; VI-NEXT: v_mul_f32_e32 v3, v3, v6 +; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; VI-NEXT: v_add_f32_e32 v3, v3, v7 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 ; VI-NEXT: v_trunc_f16_e32 v3, v3 @@ -126,10 +132,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: global_load_ushort v2, v0, 
s[0:1] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 @@ -146,10 +161,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 +; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 +; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 @@ -166,15 +190,28 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -191,16 +228,29 @@ define amdgpu_kernel void 
@frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-NEXT: s_clause 0x1 ; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7] ; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX1150-NEXT: s_waitcnt vmcnt(1) +; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v3, v3 -; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX1150-NEXT: s_nop 0 @@ -1974,8 +2024,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_rcp_f32_e32 v7, v7 -; VI-NEXT: v_mul_f32_e32 v5, v5, v7 +; VI-NEXT: v_rcp_f32_e32 v8, v7 +; VI-NEXT: v_mul_f32_e32 v9, v5, v8 +; VI-NEXT: v_mad_f32 v10, -v7, v9, v5 +; VI-NEXT: v_mac_f32_e32 v9, v10, v8 +; VI-NEXT: v_mad_f32 v5, -v7, v9, v5 +; VI-NEXT: v_mul_f32_e32 v5, v5, v8 +; VI-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; VI-NEXT: v_add_f32_e32 v5, v5, v9 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 ; VI-NEXT: v_trunc_f16_e32 v5, v5 @@ -1983,8 +2039,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_rcp_f32_e32 v6, v6 -; VI-NEXT: v_mul_f32_e32 v5, v5, v6 +; VI-NEXT: v_rcp_f32_e32 v7, v6 +; VI-NEXT: v_mul_f32_e32 v8, v5, v7 +; VI-NEXT: v_mad_f32 v9, -v6, v8, v5 +; VI-NEXT: v_mac_f32_e32 v8, v9, v7 +; VI-NEXT: v_mad_f32 v5, -v6, v8, v5 +; VI-NEXT: v_mul_f32_e32 v5, v5, v7 +; VI-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; VI-NEXT: v_add_f32_e32 v5, v5, v8 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 ; VI-NEXT: v_trunc_f16_e32 v5, v5 @@ -2001,21 +2063,38 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 
v3, v2 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_rcp_f32_e32 v7, v7 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v7 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX9-NEXT: v_mad_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 +; GFX9-NEXT: v_mac_f32_e32 v5, v8, v7 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1 -; GFX9-NEXT: v_trunc_f16_e32 v4, v4 -; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1 +; GFX9-NEXT: v_mad_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_div_fixup_f16 v1, v1, v6, v4 +; GFX9-NEXT: v_trunc_f16_e32 v1, v1 +; GFX9-NEXT: v_fma_f16 v1, -v1, v6, v4 ; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -2030,18 +2109,35 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 +; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 +; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX10-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-NEXT: v_mul_f32_e32 v7, v4, v6 +; GFX10-NEXT: v_mad_f32 v8, -v5, v7, v4 +; GFX10-NEXT: v_mac_f32_e32 v7, v8, v6 +; GFX10-NEXT: v_mad_f32 v4, -v5, v7, v4 +; GFX10-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v7 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 ; 
GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v4, v4 ; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1 @@ -2059,28 +2155,52 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v7, v7 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v7 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v7 +; GFX11-NEXT: v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v7 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1 -; GFX11-NEXT: v_trunc_f16_e32 v4, v4 +; GFX11-NEXT: v_div_fixup_f16 v1, v1, v6, v4 +; GFX11-NEXT: v_trunc_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1 +; GFX11-NEXT: v_fma_f16 v1, -v1, v6, v4 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 ; 
GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -2098,31 +2218,55 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 ; GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX1150-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v3, v5 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 ; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v5, v4, v3 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX1150-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5 +; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v5 +; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1 +; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3 ; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX1150-NEXT: s_nop 0 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2364,8 +2508,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_rcp_f32_e32 v9, v9 -; VI-NEXT: v_mul_f32_e32 v7, v7, v9 +; VI-NEXT: v_rcp_f32_e32 v10, v9 +; VI-NEXT: v_mul_f32_e32 v11, v7, v10 +; VI-NEXT: v_mad_f32 v12, -v9, v11, v7 +; VI-NEXT: v_mac_f32_e32 v11, v12, v10 +; VI-NEXT: v_mad_f32 v7, -v9, v11, v7 +; VI-NEXT: v_mul_f32_e32 v7, v7, v10 +; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; VI-NEXT: v_add_f32_e32 v7, v7, v11 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 ; VI-NEXT: v_trunc_f16_e32 v7, v7 @@ -2373,8 +2523,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_rcp_f32_e32 v8, v8 -; VI-NEXT: v_mul_f32_e32 v7, v7, v8 +; VI-NEXT: v_rcp_f32_e32 v9, v8 +; VI-NEXT: v_mul_f32_e32 v10, v7, v9 +; VI-NEXT: v_mad_f32 v11, -v8, v10, v7 +; VI-NEXT: v_mac_f32_e32 v10, v11, v9 +; VI-NEXT: v_mad_f32 v7, -v8, v10, v7 +; VI-NEXT: v_mul_f32_e32 v7, v7, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; VI-NEXT: v_add_f32_e32 v7, v7, v10 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 ; VI-NEXT: v_trunc_f16_e32 v7, v7 @@ -2384,8 +2540,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-NEXT: v_or_b32_e32 v3, v3, v6 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; VI-NEXT: v_rcp_f32_e32 v8, v8 -; VI-NEXT: v_mul_f32_e32 v6, v6, v8 +; VI-NEXT: v_rcp_f32_e32 v9, v8 +; VI-NEXT: v_mul_f32_e32 v10, v6, v9 +; VI-NEXT: v_mad_f32 v11, -v8, v10, v6 +; VI-NEXT: v_mac_f32_e32 v10, v11, v9 +; VI-NEXT: v_mad_f32 v6, -v8, v10, v6 +; VI-NEXT: v_mul_f32_e32 v6, v6, v9 +; VI-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; VI-NEXT: v_add_f32_e32 v6, v6, v10 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5 ; VI-NEXT: v_trunc_f16_e32 v6, v6 @@ -2393,8 +2555,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_rcp_f32_e32 v7, v7 -; VI-NEXT: v_mul_f32_e32 v6, v6, v7 +; VI-NEXT: v_rcp_f32_e32 v8, v7 +; VI-NEXT: v_mul_f32_e32 v9, v6, v8 +; VI-NEXT: v_mad_f32 v10, -v7, v9, v6 +; VI-NEXT: v_mac_f32_e32 v9, v10, v8 +; VI-NEXT: v_mad_f32 v6, -v7, v9, v6 +; VI-NEXT: v_mul_f32_e32 v6, v6, v8 +; VI-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; VI-NEXT: v_add_f32_e32 v6, v6, v9 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 ; VI-NEXT: v_trunc_f16_e32 v6, v6 @@ -2411,36 +2579,69 @@ define 
amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0] +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v9, v8 +; GFX9-NEXT: v_rcp_f32_e32 v6, v6 +; GFX9-NEXT: v_rcp_f32_e32 v9, v9 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-NEXT: v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mac_f32_e32 v5, v7, v6 +; GFX9-NEXT: v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX9-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v9 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX9-NEXT: v_mad_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX9-NEXT: v_trunc_f16_e32 v5, v5 +; GFX9-NEXT: v_mac_f32_e32 v7, v10, v9 ; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX9-NEXT: v_rcp_f32_e32 v6, v6 -; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1 -; GFX9-NEXT: v_trunc_f16_e32 v6, v6 -; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_mad_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX9-NEXT: v_div_fixup_f16 v1, v1, v8, v6 +; GFX9-NEXT: v_trunc_f16_e32 v1, v1 +; GFX9-NEXT: v_fma_f16 v1, -v1, v8, v6 ; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_rcp_f32_e32 v8, v8 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mac_f32_e32 v3, v6, v5 +; GFX9-NEXT: v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v8 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0 +; GFX9-NEXT: v_mad_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 +; GFX9-NEXT: v_mac_f32_e32 v6, v9, v8 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0 -; GFX9-NEXT: v_trunc_f16_e32 v5, v5 -; 
GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0 +; GFX9-NEXT: v_mad_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_div_fixup_f16 v0, v0, v7, v5 +; GFX9-NEXT: v_trunc_f16_e32 v0, v0 +; GFX9-NEXT: v_fma_f16 v0, -v0, v7, v5 ; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -2455,33 +2656,66 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX10-NEXT: v_rcp_f32_e32 v5, v5 -; GFX10-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX10-NEXT: v_rcp_f32_e32 v7, v6 +; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 +; GFX10-NEXT: v_mad_f32 v9, -v6, v8, v5 +; GFX10-NEXT: v_mac_f32_e32 v8, v9, v7 +; GFX10-NEXT: v_mad_f32 v5, -v6, v8, v5 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v8 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 ; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX10-NEXT: v_rcp_f32_e32 v6, v6 -; GFX10-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX10-NEXT: v_rcp_f32_e32 v8, v7 +; GFX10-NEXT: v_mul_f32_e32 v9, v6, v8 +; GFX10-NEXT: v_mad_f32 v10, -v7, v9, v6 +; GFX10-NEXT: v_mac_f32_e32 v9, v10, v8 +; GFX10-NEXT: v_mad_f32 v6, -v7, v9, v6 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v8 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 ; GFX10-NEXT: v_trunc_f16_e32 v6, v6 ; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX10-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX10-NEXT: v_mad_f32 v8, -v5, v7, v3 +; GFX10-NEXT: v_mac_f32_e32 v7, v8, v6 +; GFX10-NEXT: v_mad_f32 v3, -v5, v7, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX10-NEXT: v_rcp_f32_e32 v5, v5 -; GFX10-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_rcp_f32_e32 v7, v6 +; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 +; GFX10-NEXT: v_mad_f32 v9, -v6, v8, v5 +; GFX10-NEXT: 
v_mac_f32_e32 v8, v9, v7 +; GFX10-NEXT: v_mad_f32 v5, -v6, v8, v5 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v8 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 ; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 @@ -2499,50 +2733,97 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v6, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-NEXT: v_rcp_f32_e32 v9, v9 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX11-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v5, v7, v6 +; GFX11-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v7, v7, v9 ; GFX11-NEXT: v_trunc_f16_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX11-NEXT: v_rcp_f32_e32 v6, v6 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v7, v10, v9 +; GFX11-NEXT: v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1 -; GFX11-NEXT: v_trunc_f16_e32 v6, v6 +; GFX11-NEXT: v_div_fixup_f16 v1, v1, v8, v6 +; GFX11-NEXT: v_trunc_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX11-NEXT: v_fma_f16 v1, -v1, v8, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v7 ; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v8, v8 +; GFX11-NEXT: v_rcp_f32_e32 v5, v5 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0 +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v3, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0 +; GFX11-NEXT: v_mul_f32_e32 v6, v6, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_fmac_f32_e32 v6, v9, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX11-NEXT: v_rcp_f32_e32 v5, v5 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0 -; GFX11-NEXT: v_trunc_f16_e32 v5, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_div_fixup_f16 v0, v0, v7, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f16 v0, -v5, v2, v0 +; GFX11-NEXT: v_trunc_f16_e32 v0, v0 +; GFX11-NEXT: v_fma_f16 v0, -v0, v7, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0 ; GFX11-NEXT: 
global_store_b64 v4, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 @@ -2560,55 +2841,102 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 ; GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v6, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v5, v7 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v8, v8 +; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8 +; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 ; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_fmac_f16_e32 v7, v6, v5 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX1150-NEXT: v_rcp_f32_e32 v7, v7 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v0 +; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 -; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_fma_f16 v0, v5, v2, v0 -; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7 +; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0 +; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 +; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0 +; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-NEXT: v_rcp_f32_e32 v7, v7 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v6 +; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7 +; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2 ; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v6, v5, v2 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX1150-NEXT: v_rcp_f32_e32 v2, v2 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v2, v1, v2, 0 op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v2, v2, v3, v1 +; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: 
v_trunc_f16_e32 v2, v2 -; GFX1150-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v1, v2, v3 -; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v6 +; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3 +; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5] ; GFX1150-NEXT: s_nop 0 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
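
Note (reviewer aid, not part of the patch): the updated CHECK lines above all exercise the same refined f16 division sequence — convert to f32, take the reciprocal approximation, form a quotient, apply two mad/fma error corrections, mask the scaled residual to its sign and exponent bits (0xff800000), add it back, then truncate to f16 and finish with v_div_fixup_f16. A minimal C sketch of that pattern, using plain floats for the f32 VALU ops and 1.0f/x as a stand-in for the hardware V_RCP_F32 approximation (function names here are illustrative, not from the patch):

#include <stdint.h>
#include <string.h>

/* Keep sign + exponent of x, clear the mantissa (the v_and_b32 0xff800000 step),
 * i.e. round the correction term to a signed power of two. */
static float mask_sign_exponent(float x) {
  uint32_t bits;
  memcpy(&bits, &x, sizeof bits);
  bits &= 0xff800000u;
  memcpy(&x, &bits, sizeof bits);
  return x;
}

/* Refined f32 quotient for an f16 n/d, following the instruction sequence the
 * tests check; the caller would truncate to f16 and apply div_fixup. */
static float fdiv16_refined_quotient(float n32, float d32) {
  float r = 1.0f / d32;                 /* v_rcp_f32 (approximation)     */
  float q = n32 * r;                    /* v_mul_f32: q = n * rcp        */
  float e = -d32 * q + n32;             /* v_mad_f32: err = -d * q + n   */
  q = e * r + q;                        /* v_mac_f32: q += err * rcp     */
  e = -d32 * q + n32;                   /* second error term             */
  float t = mask_sign_exponent(e * r);  /* v_mul_f32 + v_and_b32         */
  return t + q;                         /* v_add_f32                     */
}

Masking the scaled residual down to a signed power of two means the final add nudges the quotient by a clean step rather than reintroducing low-order rounding noise before the f16 truncation, which is why the generated code pairs the v_and_b32 with the closing v_add_f32 in every run above.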