@@ -5014,6 +5014,42 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
   return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
 }
 
+/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
+/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
+/// instead of a scalar.
+static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
+                                                Type *ScalarTy, VectorType *Ty,
+                                                const APInt &DemandedElts,
+                                                bool Insert, bool Extract,
+                                                TTI::TargetCostKind CostKind,
+                                                ArrayRef<Value *> VL = {}) {
+  assert(!isa<ScalableVectorType>(Ty) &&
+         "ScalableVectorType is not supported.");
+  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
+             getNumElements(Ty) &&
+         "Incorrect usage.");
+  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+    assert(SLPReVec && "Only supported by REVEC.");
+    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
+    // of CreateInsertElement.
+    unsigned ScalarTyNumElements = VecTy->getNumElements();
+    InstructionCost Cost = 0;
+    for (unsigned I : seq(DemandedElts.getBitWidth())) {
+      if (!DemandedElts[I])
+        continue;
+      if (Insert)
+        Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
+                               I * ScalarTyNumElements, VecTy);
+      if (Extract)
+        Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
+                               I * ScalarTyNumElements, VecTy);
+    }
+    return Cost;
+  }
+  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                      CostKind, VL);
+}
+
 /// Correctly creates insert_subvector, checking that the index is multiple of
 /// the subvectors length. Otherwise, generates shuffle using \p Generator or
 /// using default shuffle.
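Editor's note (not part of the diff): under REVEC each demanded "element" of the widened type is itself a fixed-width subvector, so the new helper charges one insert/extract-subvector shuffle at lane offset I * ScalarTyNumElements per demanded element instead of one insertelement/extractelement per scalar lane, and otherwise falls through to TTI.getScalarizationOverhead. The sketch below is a minimal standalone model of that loop under an assumed unit shuffle cost; modelScalarizationOverhead, subvectorShuffleCost, and SubVecWidth are illustrative names, not LLVM APIs.

```cpp
// Standalone model of the REVEC branch of the new helper (no LLVM APIs used).
// Assumption: every insert/extract-subvector shuffle has unit cost.
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the per-element shuffle cost query.
static unsigned subvectorShuffleCost(unsigned /*Offset*/) { return 1; }

static unsigned modelScalarizationOverhead(const std::vector<bool> &DemandedElts,
                                           unsigned SubVecWidth, bool Insert,
                                           bool Extract) {
  unsigned Cost = 0;
  for (unsigned I = 0; I < DemandedElts.size(); ++I) {
    if (!DemandedElts[I])
      continue;
    // Each demanded element is a whole subvector placed at this lane offset.
    unsigned Offset = I * SubVecWidth;
    if (Insert)
      Cost += subvectorShuffleCost(Offset);
    if (Extract)
      Cost += subvectorShuffleCost(Offset);
  }
  return Cost;
}

int main() {
  // Example: 8 REVEC elements, each a <4 x float>; elements 0, 2, 5 demanded.
  std::vector<bool> Demanded(8, false);
  Demanded[0] = Demanded[2] = Demanded[5] = true;
  std::printf("modeled insert overhead: %u\n",
              modelScalarizationOverhead(Demanded, /*SubVecWidth=*/4,
                                         /*Insert=*/true, /*Extract=*/false));
  return 0;
}
```

Running it prints a modeled overhead of 3, one unit per demanded subvector (charged at lane offsets 0, 8, and 20).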
@@ -5207,22 +5243,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                     Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
     // Estimate the cost of masked gather GEP. If not a splat, roughly
     // estimate as a buildvector, otherwise estimate as splat.
-    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
-    VectorType *PtrVecTy =
-        getWidenedType(PointerOps.front()->getType()->getScalarType(),
-                       VecTy->getNumElements());
+    APInt DemandedElts = APInt::getAllOnes(Sz);
+    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
+    VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
     if (static_cast<unsigned>(count_if(
             PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
         any_of(PointerOps, [&](Value *V) {
           return getUnderlyingObject(V) !=
                  getUnderlyingObject(PointerOps.front());
         }))
-      VectorGEPCost += TTI.getScalarizationOverhead(
-          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+      VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
+                                                DemandedElts, /*Insert=*/true,
+                                                /*Extract=*/false, CostKind);
     else
       VectorGEPCost +=
-          TTI.getScalarizationOverhead(
-              PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
+          getScalarizationOverhead(
+              TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
               /*Insert=*/true, /*Extract=*/false, CostKind) +
           ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
     // The cost of scalar loads.
@@ -5240,8 +5276,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                    /*VariableMask=*/false, CommonAlignment, CostKind) +
         (ProfitableGatherPointers ? 0 : VectorGEPCost);
     InstructionCost GatherCost =
-        TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                     /*Extract=*/false, CostKind) +
+        getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                                 /*Insert=*/true,
+                                 /*Extract=*/false, CostKind) +
         ScalarLoadsCost;
     // The list of loads is small or perform partial check already - directly
     // compare masked gather cost and gather cost.
@@ -5294,16 +5331,15 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       // Can be vectorized later as a serie of loads/insertelements.
       InstructionCost VecLdCost = 0;
       if (!DemandedElts.isZero()) {
-        VecLdCost =
-            TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                         /*Extract=*/false, CostKind) +
-            ScalarGEPCost;
+        VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                                             /*Insert=*/true,
+                                             /*Extract=*/false, CostKind) +
+                    ScalarGEPCost;
         for (unsigned Idx : seq<unsigned>(VL.size()))
           if (DemandedElts[Idx])
             VecLdCost +=
                 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
       }
-      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
       auto *SubVecTy = getWidenedType(ScalarTy, VF);
       for (auto [I, LS] : enumerate(States)) {
         auto *LI0 = cast<LoadInst>(VL[I * VF]);
@@ -5323,13 +5359,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
               return getUnderlyingObject(V) !=
                      getUnderlyingObject(PointerOps.front());
             }))
-          VectorGEPCost += TTI.getScalarizationOverhead(
-              SubVecTy, APInt::getAllOnes(VF),
+          VectorGEPCost += getScalarizationOverhead(
+              TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
               /*Insert=*/true, /*Extract=*/false, CostKind);
         else
           VectorGEPCost +=
-              TTI.getScalarizationOverhead(
-                  SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
+              getScalarizationOverhead(
+                  TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
                   /*Insert=*/true, /*Extract=*/false, CostKind) +
               ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
                                CostKind);
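Editor's note (my reading of the hunk above, not stated in the diff): the one-bit-set mask shrinks from ScalarTyNumElements * VF bits to VF bits because the new helper counts demanded elements in units of ScalarTy and asserts getNumElements(ScalarTy) * DemandedElts.getBitWidth() == getNumElements(Ty). A tiny standalone check of that invariant with assumed example numbers:

```cpp
#include <cassert>

int main() {
  // Assumed example: ScalarTy = <4 x float> under REVEC and VF = 8 subvectors,
  // so the widened SubVecTy has 32 lanes. The demanded-elements mask is now VF
  // bits wide (one bit per subvector), not 4 * VF bits (one per scalar lane).
  unsigned ScalarTyNumElements = 4;
  unsigned VF = 8;
  unsigned WidenedNumElements = ScalarTyNumElements * VF; // getNumElements(Ty)
  unsigned DemandedBitWidth = VF; // APInt::getOneBitSet(VF, 0) in the patch
  assert(ScalarTyNumElements * DemandedBitWidth == WidenedNumElements);
  return 0;
}
```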
@@ -9912,20 +9948,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
     Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
                              Idx, getWidenedType(ScalarTy, Sz));
   }
-  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
-    assert(SLPReVec && "Only supported by REVEC.");
-    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
-    // of CreateInsertElement.
-    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
-      if (DemandedElts[I])
-        Cost +=
-            TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
-                                CostKind, I * ScalarTyNumElements, FTy);
-  } else {
-    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                          /*Extract=*/false, CostKind);
-  }
+  Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
+                                   /*Insert=*/true,
+                                   /*Extract=*/false, CostKind);
   int Sz = TE.Scalars.size();
   SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                                TE.ReorderIndices.end());
@@ -9942,7 +9967,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
                                        ? TTI::SK_PermuteTwoSrc
                                        : TTI::SK_PermuteSingleSrc,
                                    VecTy, ReorderMask);
-  DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+  DemandedElts = APInt::getAllOnes(TE.Scalars.size());
   ReorderMask.assign(Sz, PoisonMaskElem);
   for (unsigned I : seq<unsigned>(Sz)) {
     Value *V = TE.getOrdered(I);
@@ -9954,8 +9979,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
       ReorderMask[I] = I + Sz;
     }
   }
-  InstructionCost BVCost = TTI->getScalarizationOverhead(
-      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+  InstructionCost BVCost =
+      getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
+                               /*Insert=*/true, /*Extract=*/false, CostKind);
   if (!DemandedElts.isAllOnes())
     BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
   if (Cost >= BVCost) {
@@ -11603,9 +11629,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     assert(Offset < NumElts && "Failed to find vector index offset");
 
     InstructionCost Cost = 0;
-    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
-                                          /*Insert*/ true, /*Extract*/ false,
-                                          CostKind);
+    Cost -=
+        getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
+                                 /*Insert*/ true, /*Extract*/ false, CostKind);
 
     // First cost - resize to actual vector size if not identity shuffle or
     // need to shift the vector.
@@ -13780,8 +13806,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       }
       if (!IsIdentity)
         FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
-      FirstShuffleCost += TTI->getScalarizationOverhead(
-          MaskVecTy, DemandedElts, /*Insert=*/true,
+      FirstShuffleCost += getScalarizationOverhead(
+          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
           /*Extract=*/false, CostKind);
     }
     InstructionCost SecondShuffleCost = 0;
@@ -13805,17 +13831,17 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       }
       if (!IsIdentity)
         SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
-      SecondShuffleCost += TTI->getScalarizationOverhead(
-          MaskVecTy, DemandedElts, /*Insert=*/true,
+      SecondShuffleCost += getScalarizationOverhead(
+          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
           /*Extract=*/false, CostKind);
     }
     APInt DemandedElts = APInt::getAllOnes(SubMask.size());
     for (auto [I, Idx] : enumerate(SubMask))
       if (Idx == PoisonMaskElem)
         DemandedElts.clearBit(I);
-    InstructionCost BuildVectorCost =
-        TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
-                                      /*Extract=*/false, CostKind);
+    InstructionCost BuildVectorCost = getScalarizationOverhead(
+        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
+        /*Extract=*/false, CostKind);
     const TreeEntry *BestEntry = nullptr;
     if (FirstShuffleCost < ShuffleCost) {
       std::for_each(std::next(Mask.begin(), Part * VL.size()),
@@ -13968,45 +13994,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
     ShuffledElements.setBit(I);
     ShuffleMask[I] = Res.first->second;
   }
-  if (!DemandedElements.isZero()) {
-    if (isa<FixedVectorType>(ScalarTy)) {
-      assert(SLPReVec && "Only supported by REVEC.");
-      // We don't need to insert elements one by one. Instead, we can insert the
-      // entire vector into the destination.
-      Cost = 0;
-      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-      for (unsigned I : seq<unsigned>(VL.size()))
-        if (DemandedElements[I])
-          Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
-                                   CostKind, I * ScalarTyNumElements,
-                                   cast<FixedVectorType>(ScalarTy));
-    } else {
-      Cost += TTI->getScalarizationOverhead(VecTy, DemandedElements,
-                                            /*Insert=*/true,
-                                            /*Extract=*/false, CostKind, VL);
-    }
-  }
-  if (ForPoisonSrc) {
-    if (isa<FixedVectorType>(ScalarTy)) {
-      assert(SLPReVec && "Only supported by REVEC.");
-      // We don't need to insert elements one by one. Instead, we can insert the
-      // entire vector into the destination.
-      assert(DemandedElements.isZero() &&
-             "Need to consider the cost from DemandedElements.");
-      Cost = 0;
-      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-      for (unsigned I : seq<unsigned>(VL.size()))
-        if (!ShuffledElements[I])
-          Cost += TTI->getShuffleCost(
-              TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
-              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
-    } else {
-      Cost = TTI->getScalarizationOverhead(VecTy,
-                                           /*DemandedElts*/ ~ShuffledElements,
-                                           /*Insert*/ true,
-                                           /*Extract*/ false, CostKind, VL);
-    }
-  }
+  if (!DemandedElements.isZero())
+    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
+                                     /*Insert=*/true,
+                                     /*Extract=*/false, CostKind, VL);
+  if (ForPoisonSrc)
+    Cost = getScalarizationOverhead(*TTI, ScalarTy, VecTy,
+                                    /*DemandedElts*/ ~ShuffledElements,
+                                    /*Insert*/ true,
+                                    /*Extract*/ false, CostKind, VL);
   if (DuplicateNonConst)
     Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                              VecTy, ShuffleMask);
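Editor's note (not part of the diff): with the duplicated REVEC special cases removed, getGatherCost expresses both paths through the shared helper; for the ForPoisonSrc path the demanded mask is simply the complement of ShuffledElements. A minimal standalone model of that complement, using illustrative names only:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Assumed example: lanes 1 and 4 were already covered by the permute mask.
  std::vector<bool> ShuffledElements = {false, true, false, false, true};
  std::vector<bool> DemandedElts(ShuffledElements.size());
  for (size_t I = 0; I < ShuffledElements.size(); ++I)
    DemandedElts[I] = !ShuffledElements[I]; // mirrors ~ShuffledElements
  for (size_t I = 0; I < DemandedElts.size(); ++I)
    if (DemandedElts[I])
      std::printf("charge an insert for lane %zu\n", I); // lanes 0, 2, 3
  return 0;
}
```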