@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
260260 VF * getNumElements(ScalarTy));
261261}
262262
263+ /// Returns a number of elements of type \p Ty, not less than \p Sz, chosen so
264+ /// that the widened type splits by \p TTI into whole vector types during
265+ /// legalization. Falls back to bit_ceil(\p Sz) when no usable split is known.
266+ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
267+ Type *Ty, unsigned Sz) {
268+ if (!isValidElementType(Ty))
269+ return bit_ceil(Sz);
270+ // Ask TTI how many parts the widened type of Sz elements is split into.
271+ const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
272+ if (NumParts == 0 || NumParts >= Sz)
273+ return bit_ceil(Sz);
274+ return bit_ceil(divideCeil(Sz, NumParts)) * NumParts; // bit_ceil of the per-part count, times the part count.
275+ }
276+
263277static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
264278 SmallVectorImpl<int> &Mask) {
265279 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
@@ -394,7 +408,7 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
394408/// total number of elements \p Size and number of registers (parts) \p
395409/// NumParts.
396410static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
397- return PowerOf2Ceil( divideCeil(Size, NumParts));
411+ return std::min<unsigned>(Size, bit_ceil( divideCeil(Size, NumParts) )); // Clamp so the per-part count never exceeds Size.
398412}
399413
400414/// Returns correct remaining number of elements, considering total amount \p
@@ -1222,6 +1236,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
12221236 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
12231237}
12241238
1239+ /// Returns true if \p Sz elements of type \p Ty form whole vectors, i.e. \p Sz
1240+ /// is a power of 2, or \p TTI reports a part count (more than 0, less than \p Sz)
1241+ /// that divides \p Sz evenly with a power-of-2 quotient. False if \p Sz <= 1.
1242+ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1243+ unsigned Sz) {
1244+ if (Sz <= 1)
1245+ return false;
1246+ if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1247+ return false; // Neither a valid element type nor a fixed vector type.
1248+ if (has_single_bit(Sz))
1249+ return true;
1250+ const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1251+ return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1252+ Sz % NumParts == 0;
1253+ }
1254+
12251255namespace slpvectorizer {
12261256
12271257/// Bottom Up SLP Vectorizer.
@@ -3311,6 +3341,15 @@ class BoUpSLP {
33113341 /// Return true if the number of scalars in this node is not a power of 2.
33123342 bool isNonPowOf2Vec() const {
33133343 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3344+ return IsNonPowerOf2;
3345+ }
3346+
3347+ /// Return true if the scalars of this node do NOT form whole vectors or a
3348+ /// power-of-2 sized vector, i.e. hasFullVectorsOrPowerOf2 is false for them.
3349+ bool
3350+ hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3351+ bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3352+ TTI, getValueType(Scalars.front()), Scalars.size());
33143353 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
33153354 "Reshuffling not supported with non-power-of-2 vectors yet.");
33163355 return IsNonPowerOf2;
@@ -3430,8 +3469,10 @@ class BoUpSLP {
34303469 Last->State = EntryState;
34313470 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
34323471 // for non-power-of-two vectors.
3433- assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) &&
3434- "Reshuffling scalars not yet supported for nodes with padding");
3472+ assert(
3473+ (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
3474+ ReuseShuffleIndices.empty()) &&
3475+ "Reshuffling scalars not yet supported for nodes with padding");
34353476 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
34363477 ReuseShuffleIndices.end());
34373478 if (ReorderIndices.empty()) {
@@ -4412,7 +4453,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
44124453 return std::nullopt;
44134454 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
44144455 int NumParts = TTI->getNumberOfParts(VecTy);
4415- if (NumParts == 0 || NumParts >= NumScalars)
4456+ if (NumParts == 0 || NumParts >= NumScalars ||
4457+ VecTy->getNumElements() % NumParts != 0)
44164458 NumParts = 1;
44174459 SmallVector<int> ExtractMask;
44184460 SmallVector<int> Mask;
@@ -5269,7 +5311,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
52695311 // node.
52705312 if (!TE.ReuseShuffleIndices.empty()) {
52715313 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5272- assert(!TE.isNonPowOf2Vec( ) &&
5314+ assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI ) &&
52735315 "Reshuffling scalars not yet supported for nodes with padding");
52745316
52755317 if (isSplat(TE.Scalars))
@@ -5509,7 +5551,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
55095551 }
55105552 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
55115553 // has been audited for correctness with non-power-of-two vectors.
5512- if (!TE.isNonPowOf2Vec( ))
5554+ if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI ))
55135555 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
55145556 return CurrentOrder;
55155557 }
@@ -5662,15 +5704,18 @@ void BoUpSLP::reorderTopToBottom() {
56625704 });
56635705
56645706 // Reorder the graph nodes according to their vectorization factor.
5665- for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5666- VF = bit_ceil (VF) / 2 ) {
5707+ for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5708+ !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U) ) {
56675709 auto It = VFToOrderedEntries.find(VF);
56685710 if (It == VFToOrderedEntries.end())
56695711 continue;
56705712 // Try to find the most profitable order. We just are looking for the most
56715713 // used order and reorder scalar elements in the nodes according to this
56725714 // mostly used order.
56735715 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5716+ // Delete VF entry upon exit.
5717+ auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
5718+
56745719 // All operands are reordered and used only in this node - propagate the
56755720 // most used order to the user node.
56765721 MapVector<OrdersType, unsigned,
@@ -6413,7 +6458,8 @@ static void gatherPossiblyVectorizableLoads(
64136458 if (NumScalars > 1) {
64146459 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
64156460 NumParts = TTI.getNumberOfParts(VecTy);
6416- if (NumParts == 0 || NumParts >= NumScalars)
6461+ if (NumParts == 0 || NumParts >= NumScalars ||
6462+ VecTy->getNumElements() % NumParts != 0)
64176463 NumParts = 1;
64186464 }
64196465 unsigned VF = PowerOf2Ceil(NumScalars / NumParts);
@@ -7529,33 +7575,36 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
75297575 UniqueValues.emplace_back(V);
75307576 }
75317577 size_t NumUniqueScalarValues = UniqueValues.size();
7532- if (NumUniqueScalarValues == VL.size()) {
7578+ bool IsFullVectors = hasFullVectorsOrPowerOf2(
7579+ *TTI, UniqueValues.front()->getType(), NumUniqueScalarValues);
7580+ if (NumUniqueScalarValues == VL.size() &&
7581+ (VectorizeNonPowerOf2 || IsFullVectors)) {
75337582 ReuseShuffleIndices.clear();
75347583 } else {
75357584 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
7536- if ((UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) ||
7537- !llvm::has_single_bit(VL.size())) {
7585+ if ((UserTreeIdx.UserTE &&
7586+ UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
7587+ !has_single_bit(VL.size())) {
75387588 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
75397589 "for nodes with padding.\n");
75407590 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
75417591 return false;
75427592 }
75437593 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
7544- if (NumUniqueScalarValues <= 1 ||
7545- (UniquePositions.size() == 1 && all_of(UniqueValues,
7546- [](Value *V) {
7547- return isa<UndefValue>(V) ||
7548- !isConstant(V);
7549- })) ||
7550- !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
7594+ if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
7595+ (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
7596+ return isa<UndefValue>(V) || !isConstant(V);
7597+ }))) {
75517598 if (DoNotFail && UniquePositions.size() > 1 &&
75527599 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
75537600 all_of(UniqueValues, [=](Value *V) {
75547601 return isa<ExtractElementInst>(V) ||
75557602 areAllUsersVectorized(cast<Instruction>(V),
75567603 UserIgnoreList);
75577604 })) {
7558- unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
7605+ // Find the number of elements, which forms full vectors.
7606+ unsigned PWSz = getFullVectorNumberOfElements(
7607+ *TTI, UniqueValues.front()->getType(), UniqueValues.size());
75597608 if (PWSz == VL.size()) {
75607609 ReuseShuffleIndices.clear();
75617610 } else {
@@ -9793,9 +9842,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
97939842 return nullptr;
97949843 Value *VecBase = nullptr;
97959844 ArrayRef<Value *> VL = E->Scalars;
9796- // If the resulting type is scalarized, do not adjust the cost.
9797- if (NumParts == VL.size())
9798- return nullptr;
97999845 // Check if it can be considered reused if same extractelements were
98009846 // vectorized already.
98019847 bool PrevNodeFound = any_of(
@@ -9911,7 +9957,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
99119957 assert(!CommonMask.empty() && "Expected non-empty common mask.");
99129958 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
99139959 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9914- if (NumParts == 0 || NumParts >= Mask.size())
9960+ if (NumParts == 0 || NumParts >= Mask.size() ||
9961+ MaskVecTy->getNumElements() % NumParts != 0)
99159962 NumParts = 1;
99169963 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
99179964 const auto *It =
@@ -9928,7 +9975,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
99289975 assert(!CommonMask.empty() && "Expected non-empty common mask.");
99299976 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
99309977 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9931- if (NumParts == 0 || NumParts >= Mask.size())
9978+ if (NumParts == 0 || NumParts >= Mask.size() ||
9979+ MaskVecTy->getNumElements() % NumParts != 0)
99329980 NumParts = 1;
99339981 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
99349982 const auto *It =
@@ -10450,7 +10498,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1045010498 InsertMask[Idx] = I + 1;
1045110499 }
1045210500 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
10453- if (NumOfParts > 0)
10501+ if (NumOfParts > 0 && NumOfParts < NumElts )
1045410502 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
1045510503 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
1045610504 VecScalarsSz;
@@ -13579,7 +13627,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1357913627 Type *OrigScalarTy = GatheredScalars.front()->getType();
1358013628 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
1358113629 unsigned NumParts = TTI->getNumberOfParts(VecTy);
13582- if (NumParts == 0 || NumParts >= GatheredScalars.size())
13630+ if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
13631+ VecTy->getNumElements() % NumParts != 0)
1358313632 NumParts = 1;
1358413633 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
1358513634 // Check for gathered extracts.
@@ -17785,7 +17834,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1778517834 for (unsigned I = NextInst; I < MaxInst; ++I) {
1778617835 unsigned ActualVF = std::min(MaxInst - I, VF);
1778717836
17788- if (!has_single_bit( ActualVF))
17837+ if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
1778917838 continue;
1779017839
1779117840 if (MaxVFOnly && ActualVF < MaxVF)
0 commit comments