@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
260260 VF * getNumElements(ScalarTy));
261261}
262262
263+ /// Returns an element count, not less than \p Sz, at which a widened vector of
264+ /// \p Ty is expected to split into whole vector parts as reported by \p TTI;
265+ /// falls back to PowerOf2Ceil(\p Sz) when no usable part count is available.
266+ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
267+ Type *Ty, unsigned Sz) {
268+ if (!isValidElementType(Ty)) // Not a valid element type: just round \p Sz up.
269+ return PowerOf2Ceil(Sz);
270+ // Query TTI for the part count of the widened type holding Sz elements.
271+ const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz)); // NOTE(review): doc mentions legalization parts; confirm this is the intended TTI query.
272+ if (NumParts == 0 || NumParts == Sz) // No part count, or as many parts as elements.
273+ return PowerOf2Ceil(Sz);
274+ return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts; // Power-of-2 elements per part, times the number of parts.
275+ }
276+
263277static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
264278 SmallVectorImpl<int> &Mask) {
265279 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
@@ -1224,6 +1238,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
12241238 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
12251239}
12261240
1241+ /// Returns true if a widened vector of \p Sz elements of type \p Ty consists of
1242+ /// full vector parts only: \p Sz is a power of 2, or it divides evenly by the
1243+ /// part count reported by \p TTI with a power-of-2 number of elements per part.
1244+ static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
1245+ unsigned Sz) {
1246+ if (Sz <= 1) // Sizes 0 and 1 are rejected outright.
1247+ return false;
1248+ if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty)) // Only valid element types or fixed vector types are accepted.
1249+ return false;
1250+ if (has_single_bit(Sz)) // Any power-of-2 size is accepted without querying TTI.
1251+ return true;
1252+ const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz)); // NOTE(review): doc mentions legalization parts; confirm this is the intended TTI query.
1253+ return NumParts > 0 && NumParts != Sz && has_single_bit(Sz / NumParts) && // NumParts > 0 is tested first, guarding the division and modulo.
1254+ Sz % NumParts == 0;
1255+ }
1256+
12271257namespace slpvectorizer {
12281258
12291259/// Bottom Up SLP Vectorizer.
@@ -2467,7 +2497,9 @@ class BoUpSLP {
24672497 }
24682498 // TODO: Check if we can remove a check for non-power-2 number of
24692499 // scalars after full support of non-power-2 vectorization.
2470- return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2500+ return UniqueValues.size() != 2 &&
2501+ hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
2502+ UniqueValues.size());
24712503 };
24722504
24732505 // If the initial strategy fails for any of the operand indexes, then we
@@ -3276,8 +3308,9 @@ class BoUpSLP {
32763308 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
32773309
32783310 /// Return true if this is a non-power-of-2 node.
3279- bool isNonPowOf2Vec() const {
3280- bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3311+ bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
3312+ bool IsNonPowerOf2 = !hasFullVectorsOnly(
3313+ TTI, getValueType(Scalars.front()), Scalars.size());
32813314 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
32823315 "Reshuffling not supported with non-power-of-2 vectors yet.");
32833316 return IsNonPowerOf2;
@@ -3455,7 +3488,7 @@ class BoUpSLP {
34553488
34563489 if (UserTreeIdx.UserTE) {
34573490 Last->UserTreeIndices.push_back(UserTreeIdx);
3458- assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3491+ assert((!Last->isNonPowOf2Vec(*TTI ) || Last->ReorderIndices.empty()) &&
34593492 "Reordering isn't implemented for non-power-of-2 nodes yet");
34603493 }
34613494 return Last;
@@ -4361,7 +4394,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
43614394 if (!isValidElementType(ScalarTy))
43624395 return std::nullopt;
43634396 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4364- int NumParts = TTI->getNumberOfParts (VecTy);
4397+ int NumParts = TTI->getRegUsageForType (VecTy);
43654398 if (NumParts == 0 || NumParts >= NumScalars)
43664399 NumParts = 1;
43674400 SmallVector<int> ExtractMask;
@@ -4733,7 +4766,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
47334766 // Check the order of pointer operands or that all pointers are the same.
47344767 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
47354768 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4736- if (!Order.empty() && !has_single_bit(VL.size() )) {
4769+ if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz )) {
47374770 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
47384771 "supported with VectorizeNonPowerOf2");
47394772 return LoadsState::Gather;
@@ -4787,12 +4820,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
47874820 });
47884821 });
47894822 const unsigned AbsoluteDiff = std::abs(*Diff);
4790- if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4791- ((Sz > MinProfitableStridedLoads ||
4792- (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
4793- has_single_bit(AbsoluteDiff))) &&
4794- AbsoluteDiff > Sz) ||
4795- *Diff == -(static_cast<int>(Sz) - 1))) {
4823+ if (IsPossibleStrided &&
4824+ (IsAnyPointerUsedOutGraph ||
4825+ ((Sz > MinProfitableStridedLoads ||
4826+ (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
4827+ hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
4828+ AbsoluteDiff > Sz) ||
4829+ *Diff == -(static_cast<int>(Sz) - 1))) {
47964830 int Stride = *Diff / static_cast<int>(Sz - 1);
47974831 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
47984832 Align Alignment =
@@ -5197,7 +5231,7 @@ static bool areTwoInsertFromSameBuildVector(
51975231std::optional<BoUpSLP::OrdersType>
51985232BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
51995233 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
5200- if (TE.isNonPowOf2Vec())
5234+ if (TE.isNonPowOf2Vec(*TTI ))
52015235 return std::nullopt;
52025236
52035237 // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5231,8 +5265,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
52315265 }
52325266 }
52335267 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5234- TTI->getNumberOfParts (getWidenedType(TE.Scalars.front()->getType(),
5235- 2 * TE.getVectorFactor())) == 1)
5268+ TTI->getRegUsageForType (getWidenedType(TE.Scalars.front()->getType(),
5269+ 2 * TE.getVectorFactor())) == 1)
52365270 return std::nullopt;
52375271 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
52385272 Sz)) {
@@ -5581,7 +5615,7 @@ void BoUpSLP::reorderTopToBottom() {
55815615
55825616 // Reorder the graph nodes according to their vectorization factor.
55835617 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5584- VF / = 2) {
5618+ VF - = 2) {
55855619 auto It = VFToOrderedEntries.find(VF);
55865620 if (It == VFToOrderedEntries.end())
55875621 continue;
@@ -5754,7 +5788,7 @@ bool BoUpSLP::canReorderOperands(
57545788 ArrayRef<TreeEntry *> ReorderableGathers,
57555789 SmallVectorImpl<TreeEntry *> &GatherOps) {
57565790 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5757- if (UserTE->isNonPowOf2Vec())
5791+ if (UserTE->isNonPowOf2Vec(*TTI ))
57585792 return false;
57595793
57605794 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5929,7 +5963,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
59295963 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
59305964 const auto AllowsReordering = [&](const TreeEntry *TE) {
59315965 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5932- if (TE->isNonPowOf2Vec())
5966+ if (TE->isNonPowOf2Vec(*TTI ))
59335967 return false;
59345968 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
59355969 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6575,7 +6609,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
65756609 case Instruction::ExtractElement: {
65766610 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
65776611 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6578- if (!has_single_bit( VL.size()))
6612+ if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
65796613 return TreeEntry::NeedToGather;
65806614 if (Reuse || !CurrentOrder.empty())
65816615 return TreeEntry::Vectorize;
@@ -6985,7 +7019,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
69857019 ReuseShuffleIndices.clear();
69867020 } else {
69877021 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6988- if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
7022+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI )) {
69897023 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
69907024 "for nodes with padding.\n");
69917025 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -6998,15 +7032,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
69987032 return isa<UndefValue>(V) ||
69997033 !isConstant(V);
70007034 })) ||
7001- !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
7035+ !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
7036+ NumUniqueScalarValues)) {
70027037 if (DoNotFail && UniquePositions.size() > 1 &&
70037038 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
70047039 all_of(UniqueValues, [=](Value *V) {
70057040 return isa<ExtractElementInst>(V) ||
70067041 areAllUsersVectorized(cast<Instruction>(V),
70077042 UserIgnoreList);
70087043 })) {
7009- unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
7044+ // Find the number of elements, which forms full vectors.
7045+ unsigned PWSz = getFullVectorNumberOfElements(
7046+ *TTI, UniqueValues.front()->getType(), UniqueValues.size());
70107047 if (PWSz == VL.size()) {
70117048 ReuseShuffleIndices.clear();
70127049 } else {
@@ -9217,7 +9254,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
92179254 }
92189255 assert(!CommonMask.empty() && "Expected non-empty common mask.");
92199256 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9220- unsigned NumParts = TTI.getNumberOfParts (MaskVecTy);
9257+ unsigned NumParts = TTI.getRegUsageForType (MaskVecTy);
92219258 if (NumParts == 0 || NumParts >= Mask.size())
92229259 NumParts = 1;
92239260 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9234,7 +9271,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
92349271 }
92359272 assert(!CommonMask.empty() && "Expected non-empty common mask.");
92369273 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9237- unsigned NumParts = TTI.getNumberOfParts (MaskVecTy);
9274+ unsigned NumParts = TTI.getRegUsageForType (MaskVecTy);
92389275 if (NumParts == 0 || NumParts >= Mask.size())
92399276 NumParts = 1;
92409277 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
@@ -9740,7 +9777,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
97409777 unsigned const NumElts = SrcVecTy->getNumElements();
97419778 unsigned const NumScalars = VL.size();
97429779
9743- unsigned NumOfParts = TTI->getNumberOfParts (SrcVecTy);
9780+ unsigned NumOfParts = TTI->getRegUsageForType (SrcVecTy);
97449781
97459782 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
97469783 unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -10956,7 +10993,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1095610993 // Keep original scalar if number of externally used instructions in
1095710994 // the same entry is not power of 2. It may help to do some extra
1095810995 // vectorization for now.
10959- KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
10996+ KeepScalar =
10997+ ScalarUsesCount <= 1 ||
10998+ !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
1096010999 }
1096111000 if (KeepScalar) {
1096211001 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11649,13 +11688,14 @@ BoUpSLP::isGatherShuffledEntry(
1164911688 if (TE == VectorizableTree.front().get())
1165011689 return {};
1165111690 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11652- if (TE->isNonPowOf2Vec())
11691+ if (TE->isNonPowOf2Vec(*TTI ))
1165311692 return {};
1165411693 Mask.assign(VL.size(), PoisonMaskElem);
1165511694 assert(TE->UserTreeIndices.size() == 1 &&
1165611695 "Expected only single user of the gather node.");
11657- assert(VL.size() % NumParts == 0 &&
11658- "Number of scalars must be divisible by NumParts.");
11696+ // Number of scalars must be divisible by NumParts.
11697+ if (VL.size() % NumParts != 0)
11698+ return {};
1165911699 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
1166011700 SmallVector<std::optional<TTI::ShuffleKind>> Res;
1166111701 for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -12794,7 +12834,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1279412834 SmallVector<SmallVector<const TreeEntry *>> Entries;
1279512835 Type *OrigScalarTy = GatheredScalars.front()->getType();
1279612836 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
12797- unsigned NumParts = TTI->getNumberOfParts (VecTy);
12837+ unsigned NumParts = TTI->getRegUsageForType (VecTy);
1279812838 if (NumParts == 0 || NumParts >= GatheredScalars.size())
1279912839 NumParts = 1;
1280012840 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
@@ -16040,7 +16080,7 @@ void BoUpSLP::computeMinimumValueSizes() {
1604016080 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
1604116081 return 0u;
1604216082
16043- unsigned NumParts = TTI->getNumberOfParts (
16083+ unsigned NumParts = TTI->getRegUsageForType (
1604416084 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
1604516085
1604616086 // The maximum bit width required to represent all the values that can be
@@ -16097,7 +16137,7 @@ void BoUpSLP::computeMinimumValueSizes() {
1609716137 // use - ignore it.
1609816138 if (NumParts > 1 &&
1609916139 NumParts ==
16100- TTI->getNumberOfParts (getWidenedType(
16140+ TTI->getRegUsageForType (getWidenedType(
1610116141 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
1610216142 return 0u;
1610316143
@@ -16958,7 +16998,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1695816998 for (unsigned I = NextInst; I < MaxInst; ++I) {
1695916999 unsigned ActualVF = std::min(MaxInst - I, VF);
1696017000
16961- if (!has_single_bit( ActualVF))
17001+ if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
1696217002 continue;
1696317003
1696417004 if (MaxVFOnly && ActualVF < MaxVF)
0 commit comments