@@ -1314,6 +1314,22 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
13141314 Sz % NumParts == 0;
13151315}
13161316
1317+ /// Returns the number of parts the type \p VecTy will be split into at the
1318+ /// codegen phase. If the type is going to be scalarized or does not use whole
1319+ /// registers, returns 1. Also returns 1 if the part count is at least \p Limit.
1320+ static unsigned
1321+ getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1322+ const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1323+ unsigned NumParts = TTI.getNumberOfParts(VecTy);
1324+ if (NumParts == 0 || NumParts >= Limit) // TTI reported 0 parts, or at least Limit parts.
1325+ return 1;
1326+ unsigned Sz = getNumElements(VecTy);
1327+ if (NumParts >= Sz || Sz % NumParts != 0 || // Need fewer parts than elements and an even split,
1328+ !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts)) // and a per-part size accepted by hasFullVectorsOrPowerOf2.
1329+ return 1;
1330+ return NumParts;
1331+ }
1332+
13171333namespace slpvectorizer {
13181334
13191335/// Bottom Up SLP Vectorizer.
@@ -4618,12 +4634,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
46184634 if (!isValidElementType(ScalarTy))
46194635 return std::nullopt;
46204636 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4621- int NumParts = TTI->getNumberOfParts(VecTy);
4622- if (NumParts == 0 || NumParts >= NumScalars ||
4623- VecTy->getNumElements() % NumParts != 0 ||
4624- !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4625- VecTy->getNumElements() / NumParts))
4626- NumParts = 1;
4637+ unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
46274638 SmallVector<int> ExtractMask;
46284639 SmallVector<int> Mask;
46294640 SmallVector<SmallVector<const TreeEntry *>> Entries;
@@ -5574,8 +5585,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
55745585 }
55755586 }
55765587 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5577- TTI-> getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5578- 2 * TE.getVectorFactor())) == 1)
5588+ :: getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
5589+ 2 * TE.getVectorFactor())) == 1)
55795590 return std::nullopt;
55805591 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
55815592 Sz)) {
@@ -9846,13 +9857,13 @@ void BoUpSLP::transformNodes() {
98469857 // Do not try to vectorize small splats (less than vector register and
98479858 // only with the single non-undef element).
98489859 bool IsSplat = isSplat(Slice);
9849- if (Slices.empty() || !IsSplat ||
9850- ( VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9851- Slice.front()->getType(), VF)),
9852- 1U, VF - 1) !=
9853- std::clamp(TTI->getNumberOfParts(getWidenedType(
9854- Slice.front()->getType(), 2 * VF)),
9855- 1U, 2 * VF)) ||
9860+ bool IsTwoRegisterSplat = true;
9861+ if (IsSplat && VF == 2) {
9862+ unsigned NumRegs2VF = ::getNumberOfParts(
9863+ *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
9864+ IsTwoRegisterSplat = NumRegs2VF == 2;
9865+ }
9866+ if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
98569867 count(Slice, Slice.front()) ==
98579868 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
98589869 : 1)) {
@@ -10793,12 +10804,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1079310804 }
1079410805 assert(!CommonMask.empty() && "Expected non-empty common mask.");
1079510806 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10796- unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10797- if (NumParts == 0 || NumParts >= Mask.size() ||
10798- MaskVecTy->getNumElements() % NumParts != 0 ||
10799- !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10800- MaskVecTy->getNumElements() / NumParts))
10801- NumParts = 1;
10807+ unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
1080210808 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1080310809 const auto *It =
1080410810 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
@@ -10813,12 +10819,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1081310819 }
1081410820 assert(!CommonMask.empty() && "Expected non-empty common mask.");
1081510821 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10816- unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10817- if (NumParts == 0 || NumParts >= Mask.size() ||
10818- MaskVecTy->getNumElements() % NumParts != 0 ||
10819- !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10820- MaskVecTy->getNumElements() / NumParts))
10821- NumParts = 1;
10822+ unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
1082210823 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1082310824 const auto *It =
1082410825 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
@@ -11351,7 +11352,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1135111352 unsigned const NumElts = SrcVecTy->getNumElements();
1135211353 unsigned const NumScalars = VL.size();
1135311354
11354- unsigned NumOfParts = TTI-> getNumberOfParts(SrcVecTy);
11355+ unsigned NumOfParts = :: getNumberOfParts(*TTI, SrcVecTy);
1135511356
1135611357 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
1135711358 unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -14862,12 +14863,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1486214863 SmallVector<SmallVector<const TreeEntry *>> Entries;
1486314864 Type *OrigScalarTy = GatheredScalars.front()->getType();
1486414865 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14865- unsigned NumParts = TTI->getNumberOfParts(VecTy);
14866- if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14867- VecTy->getNumElements() % NumParts != 0 ||
14868- !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14869- VecTy->getNumElements() / NumParts))
14870- NumParts = 1;
14866+ unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
1487114867 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
1487214868 // Check for gathered extracts.
1487314869 bool Resized = false;
@@ -14899,12 +14895,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1489914895 Resized = true;
1490014896 GatheredScalars.append(VF - GatheredScalars.size(),
1490114897 PoisonValue::get(OrigScalarTy));
14902- NumParts = TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF));
14903- if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14904- VecTy->getNumElements() % NumParts != 0 ||
14905- !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14906- VecTy->getNumElements() / NumParts))
14907- NumParts = 1;
14898+ NumParts =
14899+ ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
1490814900 }
1490914901 }
1491014902 }
@@ -17049,10 +17041,10 @@ void BoUpSLP::optimizeGatherSequence() {
1704917041 // Check if the last undefs actually change the final number of used vector
1705017042 // registers.
1705117043 return SM1.size() - LastUndefsCnt > 1 &&
17052- TTI-> getNumberOfParts(SI1->getType()) ==
17053- TTI-> getNumberOfParts(
17054- getWidenedType(SI1->getType()->getElementType(),
17055- SM1.size() - LastUndefsCnt));
17044+ :: getNumberOfParts(*TTI, SI1->getType()) ==
17045+ :: getNumberOfParts(
17046+ *TTI, getWidenedType(SI1->getType()->getElementType(),
17047+ SM1.size() - LastUndefsCnt));
1705617048 };
1705717049 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
1705817050 // instructions. TODO: We can further optimize this scan if we split the
@@ -17829,9 +17821,12 @@ bool BoUpSLP::collectValuesToDemote(
1782917821 const unsigned VF = E.Scalars.size();
1783017822 Type *OrigScalarTy = E.Scalars.front()->getType();
1783117823 if (UniqueBases.size() <= 2 ||
17832- TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17833- TTI->getNumberOfParts(getWidenedType(
17834- IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17824+ ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
17825+ ::getNumberOfParts(
17826+ *TTI,
17827+ getWidenedType(
17828+ IntegerType::get(OrigScalarTy->getContext(), BitWidth),
17829+ VF)))
1783517830 ToDemote.push_back(E.Idx);
1783617831 }
1783717832 return Res;
@@ -18241,8 +18236,8 @@ void BoUpSLP::computeMinimumValueSizes() {
1824118236 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
1824218237 return 0u;
1824318238
18244- unsigned NumParts = TTI-> getNumberOfParts(
18245- getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18239+ unsigned NumParts = :: getNumberOfParts(
18240+ *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
1824618241
1824718242 // The maximum bit width required to represent all the values that can be
1824818243 // demoted without loss of precision. It would be safe to truncate the roots
@@ -18302,8 +18297,10 @@ void BoUpSLP::computeMinimumValueSizes() {
1830218297 // use - ignore it.
1830318298 if (NumParts > 1 &&
1830418299 NumParts ==
18305- TTI->getNumberOfParts(getWidenedType(
18306- IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18300+ ::getNumberOfParts(
18301+ *TTI, getWidenedType(IntegerType::get(F->getContext(),
18302+ bit_ceil(MaxBitWidth)),
18303+ VF)))
1830718304 return 0u;
1830818305
1830918306 unsigned Opcode = E.getOpcode();
@@ -20086,14 +20083,14 @@ class HorizontalReduction {
2008620083 ReduxWidth =
2008720084 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
2008820085 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20089- NumParts = TTI. getNumberOfParts(Tp);
20086+ NumParts = :: getNumberOfParts(TTI, Tp);
2009020087 NumRegs =
2009120088 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2009220089 while (NumParts > NumRegs) {
2009320090 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
2009420091 ReduxWidth = bit_floor(ReduxWidth - 1);
2009520092 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20096- NumParts = TTI. getNumberOfParts(Tp);
20093+ NumParts = :: getNumberOfParts(TTI, Tp);
2009720094 NumRegs =
2009820095 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2009920096 }
0 commit comments