AMDGPU: Make v2f64 -> v2f16 conversion Legal only when unsafe fast math is set

changpeng · changpeng · commit 62078af2d2ea · 2025-04-11T13:47:03.000-07:00
Custom lowering v2f64 -> v2f16.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1054,9 +1054,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 
   auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
   if (ST.hasCvtPkF16F32Inst()) {
-    FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}});
-    if (TM.Options.UnsafeFPMath)
-      FPTruncActions.legalFor({V2S16, V2S64});
+    FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
+                  .customFor({V2S16, V2S64});
   } else
     FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
   FPTruncActions.scalarize(0).lower();
@@ -2156,6 +2155,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
   case TargetOpcode::G_FMINNUM_IEEE:
   case TargetOpcode::G_FMAXNUM_IEEE:
     return legalizeMinNumMaxNum(Helper, MI);
+  case TargetOpcode::G_FPTRUNC:
+    return legalizeFPTrunc(Helper, MI, MRI);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return legalizeExtractVectorElt(MI, MRI, B);
   case TargetOpcode::G_INSERT_VECTOR_ELT:
@@ -2742,6 +2743,29 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
 }
 
+bool AMDGPULegalizerInfo::legalizeFPTrunc(LegalizerHelper &Helper,
+                                          MachineInstr &MI,
+                                          MachineRegisterInfo &MRI) const {
+  // Keep this G_FPTRUNC if an inexact result is allowed, else scalarize it.
+  // TODO: Use only the fast math flag. The global option is still checked to
+  // match SelectionDAG, where the flag does not yet work for FP_ROUND.
+  MachineFunction &MF = Helper.MIRBuilder.getMF();
+  bool AllowInaccurateFPTRUNC = MI.getFlag(MachineInstr::FmAfn) ||
+                                MF.getTarget().Options.UnsafeFPMath;
+
+  if (AllowInaccurateFPTRUNC) {
+    // Leave MI unchanged so the tablegen pattern selects native instructions.
+    return true;
+  }
+
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+
+  // Scalarize the vector and fall through to lower f64 -> f16.
+  return Helper.fewerElementsVector(MI, 0, DstTy.getElementType()) ==
+         LegalizerHelper::Legalized;
+}
+
 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
   MachineInstr &MI, MachineRegisterInfo &MRI,
   MachineIRBuilder &B) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -56,6 +56,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI,
                      MachineIRBuilder &B, bool Signed) const;
   bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const;
+  bool legalizeFPTrunc(LegalizerHelper &Helper, MachineInstr &MI,
+                       MachineRegisterInfo &MRI) const;
   bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
                                 MachineIRBuilder &B) const;
   bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6893,8 +6893,18 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
     return Op;
 
   EVT DstVT = Op.getValueType();
-  if (DstVT == MVT::v2f16)
-    return DAG.getTarget().Options.UnsafeFPMath ? Op : SDValue();
+
+  if (DstVT == MVT::v2f16) {
+    // FIXME: We should only use fast math flag here. However, the fast math
+    // flag is lost during fptrunc to fp_round lowering. In addition, the flag
+    // is not propagated during subsequent lowering.
+    bool AllowInaccurateFP_ROUND = Op->getFlags().hasApproximateFuncs() ||
+                                   DAG.getTarget().Options.UnsafeFPMath;
+    // With fast math, the tablegen pattern is used to select native
+    // instructions. Otherwise, the vector will be scalarized and custom lowered
+    // to preserve the precision.
+    return AllowInaccurateFP_ROUND ? Op : SDValue();
+  }
 
   SDLoc DL(Op);
   if (DstVT == MVT::f16) {