@@ -10606,19 +10606,40 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
1060610606 return FastLowered;
1060710607
1060810608 SDLoc SL(Op);
10609- SDValue Src0 = Op.getOperand(0);
10610- SDValue Src1 = Op.getOperand(1);
10611-
10612- SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10613- SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10614-
10615- SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10616- SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10617-
10618- SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10619- SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10609+ SDValue LHS = Op.getOperand(0);
10610+ SDValue RHS = Op.getOperand(1);
1062010611
10621- return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10612+ // a32.u = opx(V_CVT_F32_F16, a.u);
10613+ // b32.u = opx(V_CVT_F32_F16, b.u);
10614+ // r32.u = opx(V_RCP_F32, b32.u);
10615+ // q32.u = opx(V_MUL_F32, a32.u, r32.u);
10616+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u);
10617+ // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u);
10618+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u);
10619+ // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10620+ // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
10621+ // tmp.u = opx(V_FREXP_MANT_F32, tmp.u);
10622+ // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10623+ // q16.u = opx(V_CVT_F16_F32, q32.u);
10624+ // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u);
10625+
10626+ SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10627+ SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10628+ SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10629+ SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt);
10630+ SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp);
10631+ SDValue Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
10632+ Quot = DAG.getNode(ISD::FMA, SL, MVT::f32, Err, Rcp, Quot);
10633+ Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
10634+ SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp);
10635+ SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10636+ TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10637+ DAG.getConstant(0xff800000, SL, MVT::i32));
10638+ Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10639+ Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot);
10640+ SDValue FPRoundFlag = DAG.getConstant(0, SL, MVT::i32);
10641+ SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10642+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS);
1062210643}
1062310644
1062410645// Faster 2.5 ULP division that does not support denormals.
0 commit comments