diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 248a107ded514..90b0ea9e51712 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3088,6 +3088,10 @@ class BoUpSLP {
   SmallVector<OrdersType, 1>
   findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
 
+  /// Tries to reorder the gathering node for better vectorization
+  /// opportunities.
+  void reorderGatherNode(TreeEntry &TE);
+
   struct TreeEntry {
     using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
     TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -3393,6 +3397,15 @@ class BoUpSLP {
       return IsNonPowerOf2;
     }
 
+    Value *getOrdered(unsigned Idx) const {
+      assert(isGather() && "Must be used only for buildvectors/gathers.");
+      if (ReorderIndices.empty())
+        return Scalars[Idx];
+      SmallVector<int> Mask;
+      inversePermutation(ReorderIndices, Mask);
+      return Scalars[Mask[Idx]];
+    }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
@@ -9340,6 +9353,160 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
   return std::make_pair(ScalarCost, VecCost);
 }
 
+void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
+  assert(TE.isGather() && TE.ReorderIndices.empty() &&
+         "Expected gather node without reordering.");
+  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
+  SmallSet<size_t, 2> LoadKeyUsed;
+
+  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
+        return VectorizableTree[Idx]->isSame(TE.Scalars);
+      }))
+    return;
+
+  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
+    Key = hash_combine(hash_value(LI->getParent()), Key);
+    Value *Ptr =
+        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
+    if (LoadKeyUsed.contains(Key)) {
+      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
+      if (LIt != LoadsMap.end()) {
+        for (LoadInst *RLI : LIt->second) {
+          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
+                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
+                              /*StrictCheck=*/true))
+            return hash_value(RLI->getPointerOperand());
+        }
+        for (LoadInst *RLI : LIt->second) {
+          if (arePointersCompatible(RLI->getPointerOperand(),
+                                    LI->getPointerOperand(), *TLI)) {
+            hash_code SubKey = hash_value(RLI->getPointerOperand());
+            return SubKey;
+          }
+        }
+        if (LIt->second.size() > 2) {
+          hash_code SubKey =
+              hash_value(LIt->second.back()->getPointerOperand());
+          return SubKey;
+        }
+      }
+    }
+    LoadKeyUsed.insert(Key);
+    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
+    return hash_value(LI->getPointerOperand());
+  };
+  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
+  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
+  bool IsOrdered = true;
+  unsigned NumInstructions = 0;
+  // Try to "cluster" scalar instructions, to be able to build extra vectorized
+  // nodes.
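+  // Scalars that produce the same hash key/subkey (same opcode, compatible
+  // loads, etc.) are placed next to each other, so long enough groups can
+  // become subvector nodes below.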
+  for (auto [I, V] : enumerate(TE.Scalars)) {
+    size_t Key = 1, Idx = 1;
+    if (auto *Inst = dyn_cast<Instruction>(V);
+        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
+        !isDeleted(Inst) && !isVectorized(V)) {
+      std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
+                                             /*AllowAlternate=*/false);
+      ++NumInstructions;
+    }
+    auto &Container = SortedValues[Key];
+    if (IsOrdered && !KeyToIndex.contains(V) &&
+        !(isa<Constant, ExtractElementInst>(V) ||
+          isVectorLikeInstWithConstOps(V)) &&
+        ((Container.contains(Idx) &&
+          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
+         (!Container.empty() && !Container.contains(Idx) &&
+          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
+      IsOrdered = false;
+    auto &KTI = KeyToIndex[V];
+    if (KTI.empty())
+      Container[Idx].push_back(V);
+    KTI.push_back(I);
+  }
+  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
+  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
+  if (!IsOrdered && NumInstructions > 1) {
+    unsigned Cnt = 0;
+    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
+    for (const auto &D : SortedValues) {
+      for (const auto &P : D.second) {
+        unsigned Sz = 0;
+        for (Value *V : P.second) {
+          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
+          for (auto [K, Idx] : enumerate(Indices)) {
+            TE.ReorderIndices[Cnt + K] = Idx;
+            TE.Scalars[Cnt + K] = V;
+          }
+          Sz += Indices.size();
+          Cnt += Indices.size();
+        }
+        if (Sz > 1 && isa<Instruction>(P.second.front())) {
+          const unsigned SubVF = getFloorFullVectorNumberOfElements(
+              *TTI, TE.Scalars.front()->getType(), Sz);
+          SubVectors.emplace_back(Cnt - Sz, SubVF);
+          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
+            DemandedElts.clearBit(I);
+        } else if (!P.second.empty() && isConstant(P.second.front())) {
+          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
+            DemandedElts.clearBit(I);
+        }
+      }
+    }
+  }
+  // Reuses always require shuffles, so consider it as profitable.
+  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
+    return;
+  // Do simple cost estimation.
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost Cost = 0;
+  auto *ScalarTy = TE.Scalars.front()->getType();
+  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
+  for (auto [Idx, Sz] : SubVectors) {
+    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
+                             CostKind, Idx, getWidenedType(ScalarTy, Sz));
+  }
+  Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+                                        /*Extract=*/false, CostKind);
+  int Sz = TE.Scalars.size();
+  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
+                               TE.ReorderIndices.end());
+  for (unsigned I : seq<unsigned>(Sz)) {
+    Value *V = TE.getOrdered(I);
+    if (isa<PoisonValue>(V)) {
+      ReorderMask[I] = PoisonMaskElem;
+    } else if (isConstant(V) || DemandedElts[I]) {
+      ReorderMask[I] = I + TE.ReorderIndices.size();
+    }
+  }
+  Cost += ::getShuffleCost(*TTI,
+                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
+                               ? TTI::SK_PermuteTwoSrc
+                               : TTI::SK_PermuteSingleSrc,
+                           VecTy, ReorderMask);
+  DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+  ReorderMask.assign(Sz, PoisonMaskElem);
+  for (unsigned I : seq<unsigned>(Sz)) {
+    Value *V = TE.getOrdered(I);
+    if (isConstant(V)) {
+      DemandedElts.clearBit(I);
+      if (!isa<PoisonValue>(V))
+        ReorderMask[I] = I;
+    } else {
+      ReorderMask[I] = I + Sz;
+    }
+  }
+  InstructionCost BVCost = TTI->getScalarizationOverhead(
+      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+  if (!DemandedElts.isAllOnes())
+    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
+  if (Cost >= BVCost) {
+    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
+    reorderScalars(TE.Scalars, Mask);
+    TE.ReorderIndices.clear();
+  }
+}
+
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
@@ -9377,6 +9544,14 @@ void BoUpSLP::transformNodes() {
         findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
       });
     };
+
+  // Try to reorder gather nodes for better vectorization opportunities.
+  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
+    TreeEntry &E = *VectorizableTree[Idx];
+    if (E.isGather())
+      reorderGatherNode(E);
+  }
+
   // The tree may grow here, so iterate over nodes, built before.
   for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
     TreeEntry &E = *VectorizableTree[Idx];
@@ -9519,6 +9694,12 @@ void BoUpSLP::transformNodes() {
         AddCombinedNode(PrevSize, Cnt, Sz);
       }
     }
+    // Restore ordering, if no extra vectorization happened.
+    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
+      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
+      reorderScalars(E.Scalars, Mask);
+      E.ReorderIndices.clear();
+    }
     switch (E.getOpcode()) {
     case Instruction::Load: {
@@ -10206,7 +10387,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     if (Mask.empty())
       return nullptr;
     Value *VecBase = nullptr;
-    ArrayRef<Value *> VL = E->Scalars;
+    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
+    if (!E->ReorderIndices.empty()) {
+      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
+                                   E->ReorderIndices.end());
+      reorderScalars(VL, ReorderMask);
+    }
     // Check if it can be considered reused if same extractelements were
     // vectorized already.
     bool PrevNodeFound = any_of(
@@ -10227,7 +10413,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     for (unsigned Part : seq<unsigned>(NumParts)) {
       unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
       ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
-      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
+      for (auto [I, V] :
+           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
         // Ignore non-extractelement scalars.
         if (isa<UndefValue>(V) ||
             (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -10364,10 +10551,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
-                   auto *EI =
-                       cast<ExtractElementInst>(InVectors.front()
-                                                    .get<const TreeEntry *>()
-                                                    ->Scalars[P.index()]);
+                   auto *EI = cast<ExtractElementInst>(
+                       InVectors.front().get<const TreeEntry *>()->getOrdered(
+                           P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
@@ -10384,22 +10570,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     if (ForExtracts) {
       // No need to add vectors here, already handled them in adjustExtracts.
-      assert(InVectors.size() == 1 &&
-             InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
-             all_of(enumerate(CommonMask),
-                    [&](auto P) {
-                      Value *Scalar = InVectors.front()
-                                          .get<const TreeEntry *>()
-                                          ->Scalars[P.index()];
-                      if (P.value() == PoisonMaskElem)
-                        return P.value() == Mask[P.index()] ||
-                               isa<UndefValue>(Scalar);
-                      if (isa<Constant>(V1))
-                        return true;
-                      auto *EI = cast<ExtractElementInst>(Scalar);
-                      return EI->getVectorOperand() == V1;
-                    }) &&
-             "Expected only tree entry for extractelement vectors.");
+      assert(
+          InVectors.size() == 1 && InVectors.front().is<const TreeEntry *>() &&
+          !CommonMask.empty() &&
+          all_of(enumerate(CommonMask),
+                 [&](auto P) {
+                   Value *Scalar =
+                       InVectors.front().get<const TreeEntry *>()->getOrdered(
+                           P.index());
+                   if (P.value() == PoisonMaskElem)
+                     return P.value() == Mask[P.index()] ||
+                            isa<UndefValue>(Scalar);
+                   if (isa<Constant>(V1))
+                     return true;
+                   auto *EI = cast<ExtractElementInst>(Scalar);
+                   return EI->getVectorOperand() == V1;
+                 }) &&
+          "Expected only tree entry for extractelement vectors.");
       return;
     }
     assert(!InVectors.empty() && !CommonMask.empty() &&
@@ -10470,7 +10657,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   InstructionCost
   finalize(ArrayRef<int> ExtMask,
            ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
-           unsigned VF = 0,
+           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
     IsFinalized = true;
     if (Action) {
@@ -10497,6 +10684,21 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
         if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
+    // Add subvectors permutation cost.
+    if (!SubVectorsMask.empty()) {
+      assert(SubVectorsMask.size() == CommonMask.size() &&
+             "Expected same size of masks for subvectors and common mask.");
+      SmallVector<int> SVMask(SubVectorsMask.begin(), SubVectorsMask.end());
+      for (auto [I1, I2] : zip(SVMask, CommonMask)) {
+        if (I2 != PoisonMaskElem) {
+          assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
+          I1 = I2 + CommonMask.size();
+        }
+      }
+      Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
+                               getWidenedType(ScalarTy, CommonMask.size()),
+                               SVMask, CostKind);
+    }
     for (auto [E, Idx] : SubVectors) {
       Type *EScalarTy = E->Scalars.front()->getType();
       bool IsSigned = true;
@@ -13539,11 +13741,17 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     UseVecBaseAsInput = false;
     SmallPtrSet<Value *, 4> UniqueBases;
     Value *VecBase = nullptr;
+    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
+    if (!E->ReorderIndices.empty()) {
+      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
+                                   E->ReorderIndices.end());
+      reorderScalars(VL, ReorderMask);
+    }
     for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
       int Idx = Mask[I];
       if (Idx == PoisonMaskElem)
         continue;
-      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+      auto *EI = cast<ExtractElementInst>(VL[I]);
       VecBase = EI->getVectorOperand();
       if (const TreeEntry *TE = R.getTreeEntry(VecBase))
         VecBase = TE->VectorizedValue;
@@ -13552,7 +13760,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       // If the only one use is vectorized - can delete the extractelement
       // itself.
       if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
-          (NumParts != 1 && count(E->Scalars, EI) > 1) ||
+          (NumParts != 1 && count(VL, EI) > 1) ||
           any_of(EI->users(), [&](User *U) {
             const TreeEntry *UTE = R.getTreeEntry(U);
             return !UTE || R.MultiNodeScalars.contains(U) ||
@@ -13564,7 +13772,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
                                          [&](const EdgeInfo &Edge) {
                                            return Edge.UserTE == UTE;
                                          }) &&
-                            is_contained(TE->Scalars, EI);
+                            is_contained(VL, EI);
                       }) != 1;
           }))
         continue;
@@ -13586,15 +13794,14 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       // into a long virtual vector register, forming the original vector.
       Value *Vec = nullptr;
       SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
-      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
+      unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
       for (unsigned Part : seq<unsigned>(NumParts)) {
-        unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
-        ArrayRef<Value *> VL =
-            ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
+        unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
+        ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
         MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
         constexpr int MaxBases = 2;
         SmallVector<Value *, MaxBases> Bases(MaxBases);
-        auto VLMask = zip(VL, SubMask);
+        auto VLMask = zip(SubVL, SubMask);
         const unsigned VF = std::accumulate(
             VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
               if (std::get<1>(D) == PoisonMaskElem)
@@ -13811,7 +14018,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   Value *
   finalize(ArrayRef<int> ExtMask,
            ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
-           unsigned VF = 0,
+           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
            function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
     IsFinalized = true;
     unsigned ScalarTyNumElements = getNumElements(ScalarTy);
@@ -13855,21 +14062,59 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
       if (CommonMask[Idx] != PoisonMaskElem)
         CommonMask[Idx] = Idx;
-    for (auto [E, Idx] : SubVectors) {
-      Value *V = E->VectorizedValue;
-      if (V->getType()->isIntOrIntVectorTy())
-        V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
-              return !isKnownNonNegative(
-                  V, SimplifyQuery(*R.DL));
-            }));
-      unsigned InsertionIndex = Idx * ScalarTyNumElements;
-      Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
-                                       Builder.getInt64(InsertionIndex));
-      if (!CommonMask.empty()) {
-        std::iota(std::next(CommonMask.begin(), InsertionIndex),
-                  std::next(CommonMask.begin(), (Idx + E->getVectorFactor()) *
-                                                    ScalarTyNumElements),
-                  InsertionIndex);
+    auto CreateSubVectors = [&](Value *Vec,
+                                SmallVectorImpl<int> &CommonMask) {
+      for (auto [E, Idx] : SubVectors) {
+        Value *V = E->VectorizedValue;
+        if (V->getType()->isIntOrIntVectorTy())
+          V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
+                return !isKnownNonNegative(
+                    V, SimplifyQuery(*R.DL));
+              }));
+        unsigned InsertionIndex = Idx * ScalarTyNumElements;
+        const unsigned SubVecVF =
+            cast<FixedVectorType>(V->getType())->getNumElements();
+        if (InsertionIndex % SubVecVF == 0) {
+          Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
+                                           Builder.getInt64(InsertionIndex));
+        } else {
+          // Create shuffle, insertvector requires that index is multiple of
+          // the subvectors length.
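+          // E.g., a <2 x i32> subvector cannot be inserted at offset 3 of an
+          // <8 x i32> vector via llvm.vector.insert, so start from an
+          // identity mask over Vec and blend the subvector lanes in from V.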
+          const unsigned VecVF =
+              cast<FixedVectorType>(Vec->getType())->getNumElements();
+          SmallVector<int> Mask(VecVF, PoisonMaskElem);
+          std::iota(Mask.begin(), Mask.end(), 0);
+          for (unsigned I : seq<unsigned>(
+                   InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
+            Mask[I] = I - Idx + VecVF;
+          Vec = createShuffle(Vec, V, Mask);
+        }
+        if (!CommonMask.empty()) {
+          std::iota(
+              std::next(CommonMask.begin(), InsertionIndex),
+              std::next(CommonMask.begin(),
+                        (Idx + E->getVectorFactor()) * ScalarTyNumElements),
+              InsertionIndex);
+        }
+      }
+      return Vec;
+    };
+    if (SubVectorsMask.empty()) {
+      Vec = CreateSubVectors(Vec, CommonMask);
+    } else {
+      SmallVector<int> SVMask(SubVectorsMask.begin(), SubVectorsMask.end());
+      for (auto [I1, I2] : zip(SVMask, CommonMask)) {
+        if (I2 != PoisonMaskElem) {
+          assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
+          I1 = I2 + CommonMask.size();
+        }
+      }
+      Value *InsertVec =
+          CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
+      Vec = createShuffle(InsertVec, Vec, SVMask);
+      for (unsigned I : seq<unsigned>(CommonMask.size())) {
+        if (SVMask[I] != PoisonMaskElem)
+          CommonMask[I] = I;
       }
     }
     InVectors.front() = Vec;
@@ -13965,7 +14210,10 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
-      return ShuffleBuilder.finalize({}, SubVectors);
+      assert((E->CombinedEntriesWithIndices.empty() ||
+              E->ReorderIndices.empty()) &&
+             "Expected either combined subnodes or reordering");
+      return ShuffleBuilder.finalize({}, SubVectors, {});
     };
     Value *V = vectorizeTree(VE, PostponedPHIs);
     if (VF * getNumElements(VL[0]->getType()) !=
@@ -14059,10 +14307,22 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
       });
   // Build a mask out of the reorder indices and reorder scalars per this
   // mask.
-  SmallVector<int> ReorderMask;
-  inversePermutation(E->ReorderIndices, ReorderMask);
+  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
+                               E->ReorderIndices.end());
   if (!ReorderMask.empty())
     reorderScalars(GatheredScalars, ReorderMask);
+  SmallVector<int> SubVectorsMask;
+  inversePermutation(E->ReorderIndices, SubVectorsMask);
+  // Transform non-clustered elements in the mask to poison (-1).
+  // "Clustered" operations will be reordered using this mask later.
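+  // The resulting mask is consumed by ShuffleBuilder.finalize(), which blends
+  // the vector built from the subvectors with the rest of the gathered
+  // scalars in a single two-source shuffle.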
+  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
+    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
+      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
+        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
+  } else {
+    SubVectorsMask.clear();
+  }
+  SmallVector<Value *> StoredGS(GatheredScalars);
   auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                              unsigned I, unsigned SliceSize, bool IsNotPoisonous) {
@@ -14147,7 +14407,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
         if (I == PoisonMaskElem)
           continue;
         if (const auto *TE = getTreeEntry(
-                cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
+                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
           ExtractEntries.push_back(TE);
       }
       if (std::optional<ResTy> Delayed =
@@ -14219,7 +14479,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
         }
       }
       ShuffleBuilder.add(*FrontTE, Mask);
-      Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors);
+      Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
+                                    SubVectorsMask);
       return Res;
     }
     if (!Resized) {
@@ -14349,7 +14610,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
           continue;
         if (isa<UndefValue>(E->Scalars[I]))
           continue;
-        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+        auto *EI = cast<ExtractElementInst>(StoredGS[I]);
         Value *VecOp = EI->getVectorOperand();
         if (const auto *TE = getTreeEntry(VecOp))
           if (TE->VectorizedValue)
@@ -14480,10 +14741,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                  (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) ||
                                       IsUsedInExpr) && isa<UndefValue>(V));
        }))
-      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
+                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
-          E->ReuseShuffleIndices, SubVectors, E->Scalars.size(),
+          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
@@ -14494,7 +14756,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
     TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
     Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
     ShuffleBuilder.add(BV, ReuseMask);
-    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
+                                  SubVectorsMask);
   } else {
     // Gather all constants.
     SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
@@ -14504,7 +14767,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
     }
     Value *BV = ShuffleBuilder.gather(GatheredScalars);
     ShuffleBuilder.add(BV, Mask);
-    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
+                                  SubVectorsMask);
   }
 
   if (NeedFreeze)
@@ -14571,7 +14835,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
             return std::make_pair(VectorizableTree[P.first].get(), P.second);
           });
-    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+    assert(
+        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
+        "Expected either combined subnodes or reordering");
+    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
   };
 
   assert(!E->isGather() && "Unhandled state");
@@ -15989,7 +16256,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
       ShuffleBuilder.add(V1, CombinedMask1);
       if (V2)
         ShuffleBuilder.add(V2, CombinedMask2);
-      return ShuffleBuilder.finalize({}, {});
+      return ShuffleBuilder.finalize({}, {}, {});
     };
     auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index 6ff03acf85cdf..c976525b6720e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -4,12 +4,14 @@ define i64 @foo(i32 %tmp7) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> , i32 undef, i32 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP5:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = sub i32 undef, 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP8]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> , <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 6
+; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP11]], <8 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32>