AMDGPU: Custom lower vector fptrunc of f32 -> f16

changpeng · changpeng · commit 833b904563c0 · 2025-05-29T15:04:34.000-07:00
GFX950+ supports v_cvt_pk_f16_f32. However, the current implementation of vector fptrunc lowering fully scalarizes the vector, and the scalar conversions may not always be combined to generate the packed one. We made v2f32 -> v2f16 legal in #139956. This work is an extension to handle wider vectors. Instead of fully scalarizing, we split the vector into packs (v2f32 -> v2f16) to ensure the packed conversion can always be generated. NOTE: minor changes
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6906,7 +6906,7 @@ SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
                                                 SelectionDAG &DAG) const {
   EVT DstVT = Op.getValueType();
   unsigned NumElts = DstVT.getVectorNumElements();
-  assert(isPowerOf2_32(NumElts) && "Number of elements must be power of 2");
+  assert(NumElts > 2 && isPowerOf2_32(NumElts));
 
   auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
 
@@ -6930,7 +6930,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
     assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
     if (SrcVT.getScalarType() != MVT::f32)
       return SDValue();
-    return DstVT == MVT::v2f16 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
+    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
   }
 
   if (SrcVT.getScalarType() != MVT::f64)