Override X86TTIImpl::getStoreMinimumVF instead of tweaking codegen tables.

MatzeB · MatzeB · commit 84a22ef1a5ce · 2024-10-24T16:20:53.000-07:00
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1714,9 +1714,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
     }
-    // trunc+store via vcvtps2ph
-    setOperationAction(ISD::STORE, MVT::v4f16, Custom);
-    setOperationAction(ISD::STORE, MVT::v8f16, Custom);
   }
 
   // This block controls legalization of the mask vector sizes that are
@@ -1787,9 +1784,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
-    // trunc+store via vcvtps2ph
-    setOperationAction(ISD::STORE, MVT::v16f16, Custom);
   }
   if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
     for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2977,10 +2977,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   };
 
   static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
-    { ISD::FP_ROUND,  MVT::v8f16,   MVT::v8f32,   { 1, 1, 1, 1 } }, // vcvtps2ph
-    { ISD::FP_ROUND,  MVT::v4f16,   MVT::v4f32,   { 1, 1, 1, 1 } }, // vcvtps2ph
-    { ISD::FP_EXTEND, MVT::v8f32,   MVT::v8f16,   { 1, 1, 1, 1 } }, // vcvtph2ps
-    { ISD::FP_EXTEND, MVT::v4f32,   MVT::v4f16,   { 1, 1, 1, 1 } }, // vcvtph2ps
+    { ISD::FP_ROUND,  MVT::f16,     MVT::f32,     { 1, 1, 1, 1 } },
+    { ISD::FP_ROUND,  MVT::v8f16,   MVT::v8f32,   { 1, 1, 1, 1 } },
+    { ISD::FP_ROUND,  MVT::v4f16,   MVT::v4f32,   { 1, 1, 1, 1 } },
+    { ISD::FP_EXTEND, MVT::f32,     MVT::f16,     { 1, 1, 1, 1 } },
+    { ISD::FP_EXTEND, MVT::f64,     MVT::f16,     { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
+    { ISD::FP_EXTEND, MVT::v8f32,   MVT::v8f16,   { 1, 1, 1, 1 } },
+    { ISD::FP_EXTEND, MVT::v4f32,   MVT::v4f16,   { 1, 1, 1, 1 } },
     { ISD::FP_EXTEND, MVT::v4f64,   MVT::v4f16,   { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
   };
 
@@ -3171,6 +3174,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                             TTI::CastContextHint::None, CostKind);
   }
 
+  if (ISD == ISD::FP_ROUND && LTDest.second.getScalarType() == MVT::f16) {
+    // Conversion requires a libcall.
+    return InstructionCost::getInvalid();
+  }
+
   // TODO: Allow non-throughput costs that aren't binary.
   auto AdjustCost = [&CostKind](InstructionCost Cost,
                                 InstructionCost N = 1) -> InstructionCost {
@@ -6948,6 +6956,14 @@ bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
   return true;
 }
 
+unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                                       Type *ScalarValTy) const {
+  if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
+    return 4;
+  }
+  return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
+}
+
 bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
                                             SmallVectorImpl<Use *> &Ops) const {
   using namespace llvm::PatternMatch;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -302,6 +302,9 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
 
   bool isVectorShiftByScalarCheap(Type *Ty) const;
 
+  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                             Type *ScalarValTy) const;
+
 private:
   bool supportsGather() const;
   InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll
@@ -123,8 +123,8 @@ define void @fpext_v4xf16_v4xf64(ptr %s0, ptr %d0) {
   ret void
 }
 
-define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
-; CHECK-LABEL: define void @fpext_v16xf15_v16xf32(
+define void @fpext_v16xf16_v16xf32(ptr %s0, ptr %d0) {
+; CHECK-LABEL: define void @fpext_v16xf16_v16xf32(
 ; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1
 ; CHECK-NEXT:    [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2
@@ -206,7 +206,7 @@ define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
 ; CHECK-NEXT:    store float [[E15]], ptr [[D16]], align 8
 ; CHECK-NEXT:    ret void
 ;
-; CHECK-F16C-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-F16C-LABEL: define void @fpext_v16xf16_v16xf32(
 ; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
 ; CHECK-F16C-NEXT:    [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8
 ; CHECK-F16C-NEXT:    [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8
@@ -218,7 +218,7 @@ define void @fpext_v16xf15_v16xf32(ptr %s0, ptr %d0) {
 ; CHECK-F16C-NEXT:    store <8 x float> [[TMP4]], ptr [[D8]], align 8
 ; CHECK-F16C-NEXT:    ret void
 ;
-; CHECK-AVX512-LABEL: define void @fpext_v16xf15_v16xf32(
+; CHECK-AVX512-LABEL: define void @fpext_v16xf16_v16xf32(
 ; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
 ; CHECK-AVX512-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[S0]], align 2
 ; CHECK-AVX512-NEXT:    [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x float>
@@ -453,9 +453,14 @@ define void @fpround_v16xf32_v16xf16(ptr %s0, ptr %d0) {
 ;
 ; CHECK-F16C-LABEL: define void @fpround_v16xf32_v16xf16(
 ; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] {
-; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[S0]], align 4
-; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fptrunc <16 x float> [[TMP1]] to <16 x half>
-; CHECK-F16C-NEXT:    store <16 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT:    [[S8:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 8
+; CHECK-F16C-NEXT:    [[D8:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 8
+; CHECK-F16C-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[S0]], align 4
+; CHECK-F16C-NEXT:    [[TMP2:%.*]] = fptrunc <8 x float> [[TMP1]] to <8 x half>
+; CHECK-F16C-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[S8]], align 4
+; CHECK-F16C-NEXT:    [[TMP4:%.*]] = fptrunc <8 x float> [[TMP3]] to <8 x half>
+; CHECK-F16C-NEXT:    store <8 x half> [[TMP2]], ptr [[D0]], align 2
+; CHECK-F16C-NEXT:    store <8 x half> [[TMP4]], ptr [[D8]], align 2
 ; CHECK-F16C-NEXT:    ret void
 ;
 ; CHECK-AVX512-LABEL: define void @fpround_v16xf32_v16xf16(

Original file line number	Diff line number	Diff line change
`@@ -1714,9 +1714,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,`
`1714`	`1714`	`setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);`
`1715`	`1715`	`setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);`
`1716`	`1716`	`}`
`1717`		`- // trunc+store via vcvtps2ph`
`1718`		`- setOperationAction(ISD::STORE, MVT::v4f16, Custom);`
`1719`		`- setOperationAction(ISD::STORE, MVT::v8f16, Custom);`
`1720`	`1717`	`}`
`1721`	`1718`
`1722`	`1719`	`// This block controls legalization of the mask vector sizes that are`
`@@ -1787,9 +1784,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,`
`1787`	`1784`
`1788`	`1785`	`for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })`
`1789`	`1786`	`setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);`
`1790`		`-`
`1791`		`- // trunc+store via vcvtps2ph`
`1792`		`- setOperationAction(ISD::STORE, MVT::v16f16, Custom);`
`1793`	`1787`	`}`
`1794`	`1788`	`if (Subtarget.hasDQI() && Subtarget.hasVLX()) {`
`1795`	`1789`	`for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {`