diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f2aa0e8328585..30791bd148bec 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1409,12 +1409,6 @@ class BoUpSLP { /// Construct a vectorizable tree that starts at \p Roots. void buildTree(ArrayRef Roots); - /// Returns whether the root node has in-tree uses. - bool doesRootHaveInTreeUses() const { - return !VectorizableTree.empty() && - !VectorizableTree.front()->UserTreeIndices.empty(); - } - /// Return the scalars of the root node. ArrayRef getRootNodeScalars() const { assert(!VectorizableTree.empty() && "No graph to get the first node from"); @@ -1524,7 +1518,12 @@ class BoUpSLP { /// shuffled vector entry + (possibly) permutation with other gathers. It /// implements the checks only for possibly ordered scalars (Loads, /// ExtractElement, ExtractValue), which can be part of the graph. - std::optional findReusedOrderedScalars(const TreeEntry &TE); + /// \param TopToBottom If true, used for the whole tree rotation, false - for + /// sub-tree rotations. \param IgnoreReorder true, if the order of the root + /// node might be ignored. + std::optional findReusedOrderedScalars(const TreeEntry &TE, + bool TopToBottom, + bool IgnoreReorder); /// Sort loads into increasing pointers offsets to allow greater clustering. std::optional findPartiallyOrderedLoads(const TreeEntry &TE); @@ -1536,8 +1535,14 @@ class BoUpSLP { /// identity order is important, or the actual order. /// \param TopToBottom If true, include the order of vectorized stores and /// insertelement nodes, otherwise skip them. - std::optional getReorderingData(const TreeEntry &TE, - bool TopToBottom); + /// \param IgnoreReorder true, if the root node order can be ignored. 
+ std::optional + getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder); + + /// Checks if it is profitable to reorder the current tree. + /// If the tree does not contain many profitable reorderable nodes, better to + /// skip it to save compile time. + bool isProfitableToReorder() const; /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes @@ -1680,6 +1685,8 @@ class BoUpSLP { bool operator == (const EdgeInfo &Other) const { return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx; } + + operator bool() const { return UserTE != nullptr; } }; /// A helper class used for scoring candidates for two consecutive lanes. @@ -2999,8 +3006,10 @@ class BoUpSLP { ArrayRef VL = UserTE->getOperand(OpIdx); TreeEntry *TE = nullptr; const auto *It = find_if(VL, [&](Value *V) { + if (!isa(V)) + return false; for (TreeEntry *E : getTreeEntries(V)) { - if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) { + if (E->UserTreeIndex == EdgeInfo(UserTE, OpIdx)) { TE = E; return true; } @@ -3031,7 +3040,7 @@ class BoUpSLP { /// of a vector of (the same) instruction. TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef Ops); - /// \ returns the graph entry for the \p Idx operand of the \p E entry. + /// \returns the graph entry for the \p Idx operand of the \p E entry. const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const; /// Gets the root instruction for the given node. If the node is a strided @@ -3064,23 +3073,24 @@ class BoUpSLP { bool ResizeAllowed = false) const; /// Vectorize a single entry in the tree. - /// \param PostponedPHIs true, if need to postpone emission of phi nodes to - /// avoid issues with def-use order.
- Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs); + Value *vectorizeTree(TreeEntry *E); /// Returns vectorized operand node, that matches the order of the scalars /// operand number \p NodeIdx in entry \p E. - TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx); - const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, - unsigned NodeIdx) const { - return const_cast(this)->getMatchedVectorizedOperand(E, NodeIdx); + TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx, + ArrayRef VL, + const InstructionsState &S); + const TreeEntry * + getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx, + ArrayRef VL, + const InstructionsState &S) const { + return const_cast(this)->getMatchedVectorizedOperand(E, NodeIdx, + VL, S); } /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry /// \p E. - /// \param PostponedPHIs true, if need to postpone emission of phi nodes to - /// avoid issues with def-use order. - Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs); + Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts @@ -3091,8 +3101,7 @@ class BoUpSLP { /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. - Value *createBuildVector(const TreeEntry *E, Type *ScalarTy, - bool PostponedPHIs); + Value *createBuildVector(const TreeEntry *E, Type *ScalarTy); /// Returns the instruction in the bundle, which can be used as a base point /// for scheduling. 
Usually it is the last instruction in the bundle, except @@ -3249,9 +3258,8 @@ class BoUpSLP { } bool isOperandGatherNode(const EdgeInfo &UserEI) const { - return isGather() && !UserTreeIndices.empty() && - UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && - UserTreeIndices.front().UserTE == UserEI.UserTE; + return isGather() && UserTreeIndex.EdgeIdx == UserEI.EdgeIdx && + UserTreeIndex.UserTE == UserEI.UserTE; } /// \returns true if current entry has same operands as \p TE. @@ -3294,9 +3302,6 @@ class BoUpSLP { /// The Scalars are vectorized into this value. It is initialized to Null. WeakTrackingVH VectorizedValue = nullptr; - /// New vector phi instructions emitted for the vectorized phi nodes. - PHINode *PHI = nullptr; - /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather /// intrinsics for store/load)? @@ -3333,9 +3338,8 @@ class BoUpSLP { /// to entries. VecTreeTy &Container; - /// The TreeEntry index containing the user of this entry. We can actually - /// have multiple users so the data structure is not truly a tree. - SmallVector UserTreeIndices; + /// The TreeEntry index containing the user of this entry. + EdgeInfo UserTreeIndex; /// The index of this treeEntry in VectorizableTree. 
unsigned Idx = 0; @@ -3559,9 +3563,11 @@ class BoUpSLP { for (unsigned ReorderIdx : ReorderIndices) dbgs() << ReorderIdx << ", "; dbgs() << "\n"; - dbgs() << "UserTreeIndices: "; - for (const auto &EInfo : UserTreeIndices) - dbgs() << EInfo << ", "; + dbgs() << "UserTreeIndex: "; + if (UserTreeIndex) + dbgs() << UserTreeIndex; + else + dbgs() << ""; dbgs() << "\n"; if (!CombinedEntriesWithIndices.empty()) { dbgs() << "Combined entries: "; @@ -3707,7 +3713,7 @@ class BoUpSLP { } if (UserTreeIdx.UserTE) - Last->UserTreeIndices.push_back(UserTreeIdx); + Last->UserTreeIndex = UserTreeIdx; return Last; } @@ -4469,11 +4475,11 @@ template <> struct GraphTraits { } static ChildIteratorType child_begin(NodeRef N) { - return {N->UserTreeIndices.begin(), N->Container}; + return {&N->UserTreeIndex, N->Container}; } static ChildIteratorType child_end(NodeRef N) { - return {N->UserTreeIndices.end(), N->Container}; + return {&N->UserTreeIndex + 1, N->Container}; } /// For the node iterator we just need to turn the TreeEntry iterator into a @@ -4638,7 +4644,8 @@ static void reorderOrder(SmallVectorImpl &Order, ArrayRef Mask, } std::optional -BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { +BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, + bool TopToBottom, bool IgnoreReorder) { assert(TE.isGather() && "Expected gather node only."); // Try to find subvector extract/insert patterns and reorder only such // patterns. @@ -4664,6 +4671,26 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { if (GatherShuffles.size() == 1 && *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && Entries.front().front()->isSame(TE.Scalars)) { + // If the full matched node in whole tree rotation - no need to consider the + // matching order, rotating the whole tree. + if (TopToBottom) + return std::nullopt; + // No need to keep the order for the same user node. 
+ if (Entries.front().front()->UserTreeIndex.UserTE == + TE.UserTreeIndex.UserTE) + return std::nullopt; + // No need to keep the order for the matched root node, if it can be freely + // reordered. + if (!IgnoreReorder && Entries.front().front()->Idx == 0) + return std::nullopt; + // If shuffling 2 elements only and the matching node has reverse reuses - + // no need to count order, both work fine. + if (!Entries.front().front()->ReuseShuffleIndices.empty() && + TE.getVectorFactor() == 2 && Mask.size() == 2 && + any_of(enumerate(Entries.front().front()->ReuseShuffleIndices), + [](const auto &P) { return P.value() % 2 != P.index() % 2; })) + return std::nullopt; + // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0); @@ -5563,7 +5590,8 @@ static bool areTwoInsertFromSameBuildVector( } std::optional -BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { +BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, + bool IgnoreReorder) { // No need to reorder if need to shuffle reuses, still need to shuffle the // node. 
if (!TE.ReuseShuffleIndices.empty()) { @@ -5583,7 +5611,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { unsigned Sz = TE.Scalars.size(); if (TE.isGather()) { if (std::optional CurrentOrder = - findReusedOrderedScalars(TE)) { + findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) { SmallVector Mask; fixupOrderingIndices(*CurrentOrder); inversePermutation(*CurrentOrder, Mask); @@ -5688,10 +5716,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::move(ResOrder); } if (TE.State == TreeEntry::StridedVectorize && !TopToBottom && - any_of(TE.UserTreeIndices, - [](const EdgeInfo &EI) { - return !Instruction::isBinaryOp(EI.UserTE->getOpcode()); - }) && + (!TE.UserTreeIndex || + !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) && (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) return std::nullopt; if ((TE.State == TreeEntry::Vectorize || @@ -5885,7 +5911,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars // has been auditted for correctness with non-power-of-two vectors. 
if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) - if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) + if (std::optional CurrentOrder = + findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) return CurrentOrder; } return std::nullopt; @@ -5955,6 +5982,68 @@ static void combineOrders(MutableArrayRef Order, } } +bool BoUpSLP::isProfitableToReorder() const { + constexpr unsigned TinyVF = 2; + constexpr unsigned TinyTree = 10; + constexpr unsigned PhiOpsLimit = 12; + constexpr unsigned GatherLoadsLimit = 2; + if (VectorizableTree.size() <= TinyTree) + return true; + if (VectorizableTree.front()->hasState() && + !VectorizableTree.front()->isGather() && + (VectorizableTree.front()->getOpcode() == Instruction::Store || + VectorizableTree.front()->getOpcode() == Instruction::PHI || + (VectorizableTree.front()->getVectorFactor() <= TinyVF && + (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt || + VectorizableTree.front()->getOpcode() == Instruction::ICmp))) && + VectorizableTree.front()->ReorderIndices.empty()) { + // Check if the tree has only single store and single (unordered) load node, + // other nodes are phis or geps/binops, combined with phis, and/or single + // gather load node. + bool HasPhis = false; + if (VectorizableTree.front()->getOpcode() == Instruction::PHI && + VectorizableTree.front()->Scalars.size() == TinyVF && + VectorizableTree.front()->getNumOperands() > PhiOpsLimit) + return false; + bool HasLoad = true; + unsigned GatherLoads = 0; + for (const std::unique_ptr &TE : + ArrayRef(VectorizableTree).drop_front()) { + if (!TE->hasState()) { + if (all_of(TE->Scalars, IsaPred) || + all_of(TE->Scalars, IsaPred)) + continue; + if (VectorizableTree.front()->Scalars.size() == TinyVF && + any_of(TE->Scalars, IsaPred)) + continue; + return true; + } + if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) { + if (!TE->isGather()) { + HasLoad = false; + continue; + } + if (HasLoad) + return
true; + ++GatherLoads; + if (GatherLoads >= GatherLoadsLimit) + return true; + } + if (TE->getOpcode() == Instruction::GetElementPtr || + Instruction::isBinaryOp(TE->getOpcode())) + continue; + if (TE->getOpcode() != Instruction::PHI) + return true; + if (VectorizableTree.front()->Scalars.size() == TinyVF && + TE->getNumOperands() > PhiOpsLimit) + return false; + HasPhis = true; + } + return !HasPhis; + } + return true; +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap> VFToOrderedEntries; @@ -6003,8 +6092,12 @@ void BoUpSLP::reorderTopToBottom() { // TODO: Check the reverse order too. } + bool IgnoreReorder = + !UserIgnoreList && VectorizableTree.front()->hasState() && + (VectorizableTree.front()->getOpcode() == Instruction::InsertElement || + VectorizableTree.front()->getOpcode() == Instruction::Store); if (std::optional CurrentOrder = - getReorderingData(*TE, /*TopToBottom=*/true)) { + getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) { // Do not include ordering for nodes used in the alt opcode vectorization, // better to reorder them during bottom-to-top stage. 
If follow the order // here, it causes reordering of the whole graph though actually it is @@ -6015,14 +6108,13 @@ void BoUpSLP::reorderTopToBottom() { unsigned Cnt = 0; const TreeEntry *UserTE = TE.get(); while (UserTE && Cnt < RecursionMaxDepth) { - if (UserTE->UserTreeIndices.size() != 1) + if (!UserTE->UserTreeIndex) break; - if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) { - return EI.UserTE->State == TreeEntry::Vectorize && - EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0; - })) + if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && + UserTE->UserTreeIndex.UserTE->isAltShuffle() && + UserTE->UserTreeIndex.UserTE->Idx != 0) return; - UserTE = UserTE->UserTreeIndices.back().UserTE; + UserTE = UserTE->UserTreeIndex.UserTE; ++Cnt; } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); @@ -6168,12 +6260,10 @@ void BoUpSLP::reorderTopToBottom() { // Need to reorder the reuses masks of the operands with smaller VF to // be able to find the match between the graph nodes and scalar // operands of the given node during vectorization/cost estimation. - assert(all_of(TE->UserTreeIndices, - [VF, &TE](const EdgeInfo &EI) { - return EI.UserTE->Scalars.size() == VF || - EI.UserTE->Scalars.size() == - TE->Scalars.size(); - }) && + assert((!TE->UserTreeIndex || + TE->UserTreeIndex.UserTE->Scalars.size() == VF || + TE->UserTreeIndex.UserTE->Scalars.size() == + TE->Scalars.size()) && "All users must be of VF size."); if (SLPReVec) { assert(SLPReVec && "Only supported by REVEC."); @@ -6181,15 +6271,11 @@ void BoUpSLP::reorderTopToBottom() { // because ShuffleVectorInst supports only a limited set of // patterns). Only do reorderNodeWithReuses if all of the users are // not ShuffleVectorInst. 
- if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) { - return isa(EI.UserTE->getMainOp()); - })) + if (isa(TE->UserTreeIndex.UserTE->getMainOp())) continue; - assert(none_of(TE->UserTreeIndices, - [&](const EdgeInfo &EI) { - return isa( - EI.UserTE->getMainOp()); - }) && + assert((!TE->UserTreeIndex || + !isa( + TE->UserTreeIndex.UserTE->getMainOp())) && "Does not know how to reorder."); } // Update ordering of the operands with the smaller VF than the given @@ -6244,10 +6330,6 @@ bool BoUpSLP::canReorderOperands( })) continue; if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { - // Do not reorder if operand node is used by many user nodes. - if (any_of(TE->UserTreeIndices, - [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) - return false; // Add the node to the list of the ordered nodes with the identity // order. Edges.emplace_back(I, TE); @@ -6268,10 +6350,8 @@ bool BoUpSLP::canReorderOperands( assert(TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && "Only non-vectorized nodes are expected."); - if (any_of(TE->UserTreeIndices, - [UserTE, I](const EdgeInfo &EI) { - return EI.UserTE == UserTE && EI.EdgeIdx == I; - })) { + if (TE->UserTreeIndex.UserTE == UserTE && + TE->UserTreeIndex.EdgeIdx == I) { assert(TE->isSame(UserTE->getOperand(I)) && "Operand entry does not match operands."); Gather = TE; @@ -6299,7 +6379,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { TE->State != TreeEntry::StridedVectorize) NonVectorized.push_back(TE.get()); if (std::optional CurrentOrder = - getReorderingData(*TE, /*TopToBottom=*/false)) { + getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) { OrderedEntries.insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize) || @@ -6312,29 +6392,24 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // I.e., if the node has operands, that are reordered, try to make at least // one operand order in the natural order 
and reorder others + reorder the // user node itself. - SmallPtrSet Visited; + SmallPtrSet Visited, RevisitedOps; while (!OrderedEntries.empty()) { // 1. Filter out only reordered nodes. - // 2. If the entry has multiple uses - skip it and jump to the next node. DenseMap>> Users; SmallVector Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || (TE->isGather() && GathersToOrders.contains(TE))) || - TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || - !all_of(drop_begin(TE->UserTreeIndices), - [TE](const EdgeInfo &EI) { - return EI.UserTE == TE->UserTreeIndices.front().UserTE; - }) || + !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() || !Visited.insert(TE).second) { Filtered.push_back(TE); continue; } // Build a map between user nodes and their operands order to speedup // search. The graph currently does not provide this dependency directly. - for (EdgeInfo &EI : TE->UserTreeIndices) - Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE); + Users[TE->UserTreeIndex.UserTE].emplace_back(TE->UserTreeIndex.EdgeIdx, + TE); } // Erase filtered entries. for (TreeEntry *TE : Filtered) @@ -6372,7 +6447,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { continue; const auto Order = [&]() -> const OrdersType { if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) - return getReorderingData(*OpTE, /*TopToBottom=*/false) + return getReorderingData(*OpTE, /*TopToBottom=*/false, + IgnoreReorder) .value_or(OrdersType(1)); return OpTE->ReorderIndices; }(); @@ -6380,6 +6456,86 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // orders. if (Order.size() == 1) continue; + + // Check that the reordering does not increase number of shuffles, i.e. + // same-values-nodes has same parents or their parents has same parents. + if (!Order.empty() && !isIdentityOrder(Order)) { + Value *Root = OpTE->hasState() + ? 
OpTE->getMainOp() + : *find_if_not(OpTE->Scalars, isConstant); + auto GetSameNodesUsers = [&](Value *Root) { + SmallSetVector Res; + for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) { + if (TE != OpTE && TE->UserTreeIndex && + TE->getVectorFactor() == OpTE->getVectorFactor() && + TE->Scalars.size() == OpTE->Scalars.size() && + ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) || + (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars)))) + Res.insert(TE->UserTreeIndex.UserTE); + } + for (const TreeEntry *TE : getTreeEntries(Root)) { + if (TE != OpTE && TE->UserTreeIndex && + TE->getVectorFactor() == OpTE->getVectorFactor() && + TE->Scalars.size() == OpTE->Scalars.size() && + ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) || + (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars)))) + Res.insert(TE->UserTreeIndex.UserTE); + } + return Res.takeVector(); + }; + auto GetNumOperands = [](const TreeEntry *TE) { + if (auto *CI = dyn_cast(TE->getMainOp()); CI) + return CI->arg_size(); + return TE->getNumOperands(); + }; + auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI]( + const TreeEntry *TE) { + Intrinsic::ID ID = Intrinsic::not_intrinsic; + if (auto *CI = dyn_cast(TE->getMainOp()); CI) + ID = getVectorIntrinsicIDForCall(CI, TLI); + for (unsigned Idx : seq(GetNumOperands(TE))) { + if (ID != Intrinsic::not_intrinsic && + isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) + continue; + const TreeEntry *Op = getOperandEntry(TE, Idx); + if (Op->isGather() && Op->hasState()) { + const TreeEntry *VecOp = + getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars); + if (VecOp) + Op = VecOp; + } + if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty()) + return false; + } + return true; + }; + SmallVector Users = GetSameNodesUsers(Root); + if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) { + if (!RevisitedOps.insert(UTE).second) + return false; + return UTE == Data.first || !UTE->ReorderIndices.empty() || + 
!UTE->ReuseShuffleIndices.empty() || + (UTE->UserTreeIndex && + UTE->UserTreeIndex.UserTE == Data.first) || + (Data.first->UserTreeIndex && + Data.first->UserTreeIndex.UserTE == UTE) || + NodeShouldBeReorderedWithOperands(UTE); + })) + continue; + for (TreeEntry *UTE : Users) { + Intrinsic::ID ID = Intrinsic::not_intrinsic; + if (auto *CI = dyn_cast(UTE->getMainOp()); CI) + ID = getVectorIntrinsicIDForCall(CI, TLI); + for (unsigned Idx : seq(GetNumOperands(UTE))) { + if (ID != Intrinsic::not_intrinsic && + isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) + continue; + const TreeEntry *Op = getOperandEntry(UTE, Idx); + Visited.erase(Op); + OrderedEntries.insert(const_cast(Op)); + } + } + } unsigned NumOps = count_if( Data.second, [OpTE](const std::pair &P) { return P.second == OpTE; @@ -6411,15 +6567,16 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { return true; if (TE->isGather()) { if (GathersToOrders.contains(TE)) - return !getReorderingData(*TE, /*TopToBottom=*/false) + return !getReorderingData(*TE, /*TopToBottom=*/false, + IgnoreReorder) .value_or(OrdersType(1)) .empty(); return true; } return false; }; - for (const EdgeInfo &EI : OpTE->UserTreeIndices) { - TreeEntry *UserTE = EI.UserTE; + if (OpTE->UserTreeIndex) { + TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE; if (!VisitedUsers.insert(UserTE).second) continue; // May reorder user node if it requires reordering, has reused @@ -6437,10 +6594,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Ops, [UserTE, &AllowsReordering]( const std::pair &Op) { return AllowsReordering(Op.second) && - all_of(Op.second->UserTreeIndices, - [UserTE](const EdgeInfo &EI) { - return EI.UserTE == UserTE; - }); + Op.second->UserTreeIndex.UserTE == UserTE; })) <= Ops.size() / 2) ++Res.first->second; } @@ -7738,8 +7892,16 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::Vectorize; } - case Instruction::ExtractValue: - case Instruction::ExtractElement: { + case 
Instruction::ExtractElement: + if (any_of(VL, [&](Value *V) { + auto *EI = dyn_cast(V); + if (!EI) + return true; + return isVectorized(EI->getOperand(0)); + })) + return TreeEntry::NeedToGather; + [[fallthrough]]; + case Instruction::ExtractValue: { bool Reuse = canReuseExtract(VL, CurrentOrder); // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and // non-full registers). @@ -8265,10 +8427,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n"); for (TreeEntry *E : getTreeEntries(S.getMainOp())) { if (E->isSame(VL)) { - // Record the reuse of the tree node. - E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp() << ".\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); return; } SmallPtrSet Values(E->Scalars.begin(), E->Scalars.end()); @@ -9960,22 +10123,17 @@ void BoUpSLP::transformNodes() { }; for (auto [Cnt, Sz] : Slices) { ArrayRef Slice = VL.slice(Cnt, Sz); + const TreeEntry *SameTE = nullptr; if (const auto *It = find_if(Slice, IsaPred); It != Slice.end()) { // If any instruction is vectorized already - do not try again. 
- if (TreeEntry *SE = getSameValuesTreeEntry(*It, Slice)) { - if (SE->getVectorFactor() != Sz) - continue; - SE->UserTreeIndices.emplace_back(&E, UINT_MAX); - AddCombinedNode(SE->Idx, Cnt, Sz); - continue; - } + SameTE = getSameValuesTreeEntry(*It, Slice); } unsigned PrevSize = VectorizableTree.size(); [[maybe_unused]] unsigned PrevEntriesSize = LoadEntriesToVectorize.size(); buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX)); - if (PrevSize + 1 == VectorizableTree.size() && + if (PrevSize + 1 == VectorizableTree.size() && !SameTE && VectorizableTree[PrevSize]->isGather() && VectorizableTree[PrevSize]->hasState() && VectorizableTree[PrevSize]->getOpcode() != @@ -10089,7 +10247,7 @@ void BoUpSLP::transformNodes() { // This node is a minmax node. E.CombinedOp = TreeEntry::MinMax; TreeEntry *CondEntry = const_cast(getOperandEntry(&E, 0)); - if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 && + if (SelectOnly && CondEntry->UserTreeIndex && CondEntry->State == TreeEntry::Vectorize) { // The condition node is part of the combined minmax node. CondEntry->State = TreeEntry::CombinedVectorize; @@ -11079,15 +11237,30 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, unsigned Idx) const { - if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx)) + ArrayRef VL = E->getOperand(Idx); + InstructionsState S = getSameOpcode(VL, *TLI); + // Special processing for GEPs bundle, which may include non-gep values. 
+ if (!S && VL.front()->getType()->isPointerTy()) { + const auto *It = find_if(VL, IsaPred); + if (It != VL.end()) + S = getSameOpcode(*It, *TLI); + } + if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx, VL, S)) return VE; - const auto *It = - find_if(VectorizableTree, [&](const std::unique_ptr &TE) { - return TE->isGather() && - find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.EdgeIdx == Idx && EI.UserTE == E; - }) != TE->UserTreeIndices.end(); - }); + if (S || !isConstant(VL.front())) { + for (const TreeEntry *VE : + ValueToGatherNodes.lookup(S ? S.getMainOp() : VL.front())) + if (VE->UserTreeIndex.EdgeIdx == Idx && VE->UserTreeIndex.UserTE == E) { + assert(VE->isSame(VL) && "Expected gather node with same values."); + return VE; + } + } + const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), + [&](const std::unique_ptr &TE) { + return TE->isGather() && + TE->UserTreeIndex.EdgeIdx == Idx && + TE->UserTreeIndex.UserTE == E; + }); assert(It != VectorizableTree.end() && "Expected vectorizable entry."); return It->get(); } @@ -11216,12 +11389,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // resized. if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0 && - (E->getOpcode() != Instruction::Load || - !E->UserTreeIndices.empty())) { - const EdgeInfo &EI = - *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) { - return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX; - }); + (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) { + const EdgeInfo &EI = E->UserTreeIndex; if (EI.UserTE->getOpcode() != Instruction::Select || EI.EdgeIdx != 0) { auto UserBWIt = MinBWs.find(EI.UserTE); @@ -12550,7 +12719,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { // Exclude cost of gather loads nodes which are not used. These nodes were // built as part of the final attempt to vectorize gathered loads. 
- assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) && + assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) && "Expected gather nodes with users only."); InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); @@ -13153,7 +13322,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // reused elements too for better cost estimation. const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get() ? EdgeInfo(const_cast(TE), 0) - : TE->UserTreeIndices.front(); + : TE->UserTreeIndex; const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming @@ -13207,20 +13376,36 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // have a permutation of 2 input vectors. SmallVector> UsedTEs; DenseMap UsedValuesEntry; + SmallPtrSet VisitedValue; + auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) { + // The node is reused - exit. + if ((TEPtr->getVectorFactor() != VL.size() && + TEPtr->Scalars.size() != VL.size()) || + (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars))) + return false; + UsedTEs.clear(); + UsedTEs.emplace_back().insert(TEPtr); + for (Value *V : VL) { + if (isConstant(V)) + continue; + UsedValuesEntry.try_emplace(V, 0); + } + return true; + }; for (Value *V : VL) { - if (isConstant(V)) + if (isConstant(V) || !VisitedValue.insert(V).second) continue; // Build a list of tree entries where V is used. 
SmallPtrSet VToTEs; - for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) { + for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) { if (TEPtr == TE || TEPtr->Idx == 0) continue; assert(any_of(TEPtr->Scalars, [&](Value *V) { return GatheredScalars.contains(V); }) && "Must contain at least single gathered value."); - assert(TEPtr->UserTreeIndices.size() == 1 && + assert(TEPtr->UserTreeIndex && "Expected only single user of a gather node."); - const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front(); + const EdgeInfo &UseEI = TEPtr->UserTreeIndex; PHINode *UserPHI = dyn_cast(UseEI.UserTE->getMainOp()); const Instruction *InsertPt = @@ -13234,16 +13419,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( continue; // If the user instruction is used for some reason in different // vectorized nodes - make it depend on index. - // If any vector node is PHI node, this dependency might not work - // because of cycle dependencies, so disable it. if (TEUseEI.UserTE != UseEI.UserTE && - (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx || - any_of( - VectorizableTree, - [](const std::unique_ptr &TE) { - return TE->State == TreeEntry::Vectorize && - TE->getOpcode() == Instruction::PHI; - }))) + TEUseEI.UserTE->Idx < UseEI.UserTE->Idx) continue; } @@ -13253,6 +13430,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && !CheckOrdering(InsertPt)) continue; + // The node is reused - exit. + if (CheckAndUseSameNode(TEPtr)) + break; VToTEs.insert(TEPtr); } if (ArrayRef VTEs = getTreeEntries(V); !VTEs.empty()) { @@ -13274,6 +13454,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) continue; } + // The node is reused - exit. 
+ if (CheckAndUseSameNode(VTE)) + break; VToTEs.insert(VTE); } if (VToTEs.empty()) @@ -13686,18 +13869,17 @@ BoUpSLP::isGatherShuffledEntry( if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) return {}; Mask.assign(VL.size(), PoisonMaskElem); - assert((TE->UserTreeIndices.size() == 1 || - TE == VectorizableTree.front().get()) && + assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) && "Expected only single user of the gather node."); assert(VL.size() % NumParts == 0 && "Number of scalars must be divisible by NumParts."); - if (!TE->UserTreeIndices.empty() && - TE->UserTreeIndices.front().UserTE->isGather() && - TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) { + if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() && + TE->UserTreeIndex.EdgeIdx == UINT_MAX) { assert( (TE->Idx == 0 || (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) || - isSplat(TE->Scalars)) && + isSplat(TE->Scalars) || + getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)) && "Expected splat or extractelements only node."); return {}; } @@ -14324,11 +14506,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { (!UTEs.empty() && count_if(R.VectorizableTree, [&](const std::unique_ptr &TE) { - return any_of(TE->UserTreeIndices, - [&](const EdgeInfo &Edge) { - return Edge.UserTE == - UTEs.front(); - }) && + return TE->UserTreeIndex.UserTE == + UTEs.front() && is_contained(VL, EI); }) != 1); })) @@ -14691,41 +14870,30 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } }; -BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, - unsigned NodeIdx) { - ArrayRef VL = E->getOperand(NodeIdx); - InstructionsState S = getSameOpcode(VL, *TLI); - // Special processing for GEPs bundle, which may include non-gep values. 
- if (!S && VL.front()->getType()->isPointerTy()) { - const auto *It = find_if(VL, IsaPred); - if (It != VL.end()) - S = getSameOpcode(*It, *TLI); - } +BoUpSLP::TreeEntry * +BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx, + ArrayRef VL, + const InstructionsState &S) { if (!S) return nullptr; - auto CheckSameVE = [&](const TreeEntry *VE) { - return any_of(VE->UserTreeIndices, - [E, NodeIdx](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) || - any_of(VectorizableTree, - [E, NodeIdx, VE](const std::unique_ptr &TE) { - return TE->isOperandGatherNode( - {const_cast(E), NodeIdx}) && - VE->isSame(TE->Scalars); - }); - }; - TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL); - if (VE && CheckSameVE(VE)) + if (TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL); + VE && VE->UserTreeIndex.UserTE == E && + VE->UserTreeIndex.EdgeIdx == NodeIdx) return VE; return nullptr; } -Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, - bool PostponedPHIs) { +Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { ValueList &VL = E->getOperand(NodeIdx); + InstructionsState S = getSameOpcode(VL, *TLI); + // Special processing for GEPs bundle, which may include non-gep values. + if (!S && VL.front()->getType()->isPointerTy()) { + const auto *It = find_if(VL, IsaPred); + if (It != VL.end()) + S = getSameOpcode(*It, *TLI); + } const unsigned VF = VL.size(); - if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) { + if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx, VL, S)) { auto FinalShuffle = [&](Value *V, ArrayRef Mask) { // V may be affected by MinBWs. // We want ShuffleInstructionBuilder to correctly support REVEC. 
The key @@ -14749,7 +14917,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, "Expected either combined subnodes or reordering"); return ShuffleBuilder.finalize({}, SubVectors, {}); }; - Value *V = vectorizeTree(VE, PostponedPHIs); + Value *V = vectorizeTree(VE); if (VF * getNumElements(VL[0]->getType()) != cast(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { @@ -14790,14 +14958,13 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, // Need to update the operand gather node, if actually the operand is not a // vectorized node, but the buildvector/gather node, which matches one of // the vectorized nodes. - if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) == VE->UserTreeIndices.end()) { - auto *It = - find_if(VectorizableTree, [&](const std::unique_ptr &TE) { - return TE->isGather() && TE->UserTreeIndices.front().UserTE == E && - TE->UserTreeIndices.front().EdgeIdx == NodeIdx; - }); + if (VE->UserTreeIndex.UserTE != E || VE->UserTreeIndex.EdgeIdx != NodeIdx) { + auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), + [&](const std::unique_ptr &TE) { + return TE->isGather() && + TE->UserTreeIndex.UserTE == E && + TE->UserTreeIndex.EdgeIdx == NodeIdx; + }); assert(It != VectorizableTree.end() && "Expected gather node operand."); (*It)->VectorizedValue = V; } @@ -14807,15 +14974,15 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, // Find the corresponding gather entry and vectorize it. // Allows to be more accurate with tree/graph transformations, checks for the // correctness of the transformations in many cases. 
- auto *I = find_if(VectorizableTree, + auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1), [E, NodeIdx](const std::unique_ptr &TE) { return TE->isOperandGatherNode({E, NodeIdx}); }); assert(I != VectorizableTree.end() && "Gather node is not in the graph."); - assert(I->get()->UserTreeIndices.size() == 1 && + assert(I->get()->UserTreeIndex && "Expected only single user for the gather node."); assert(I->get()->isSame(VL) && "Expected same list of scalars."); - return vectorizeTree(I->get(), PostponedPHIs); + return vectorizeTree(I->get()); } template @@ -14864,17 +15031,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, return isa(V) && !isa(V); })) return false; - TreeEntry *UserTE = E->UserTreeIndices.back().UserTE; - unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx; + TreeEntry *UserTE = E->UserTreeIndex.UserTE; + unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx; if (UserTE->getNumOperands() != 2) return false; if (!IsNotPoisonous) { - auto *It = - find_if(VectorizableTree, [=](const std::unique_ptr &TE) { - return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) { - return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx; - }) != TE->UserTreeIndices.end(); - }); + auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1), + [=](const std::unique_ptr &TE) { + return TE->UserTreeIndex.UserTE == UserTE && + TE->UserTreeIndex.EdgeIdx != EdgeIdx; + }); if (It == VectorizableTree.end()) return false; SmallVector GS((*It)->Scalars.begin(), (*It)->Scalars.end()); @@ -15088,15 +15254,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, auto *It = find_if(Scalars, [this, E](Value *V) { return !isa(V) && (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) || - (E->UserTreeIndices.size() == 1 && - any_of(V->uses(), [E](const Use &U) { + (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) { // Check if the value already used in the same operation in // one of the nodes already. 
- return E->UserTreeIndices.front().EdgeIdx != - U.getOperandNo() && - is_contained( - E->UserTreeIndices.front().UserTE->Scalars, - U.getUser()); + return E->UserTreeIndex.EdgeIdx != U.getOperandNo() && + is_contained(E->UserTreeIndex.UserTE->Scalars, + U.getUser()); }))); }); if (It != Scalars.end()) { @@ -15311,10 +15474,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, return Res; } -Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy, - bool PostponedPHIs) { +Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { for (auto [EIdx, _] : E->CombinedEntriesWithIndices) - (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs); + (void)vectorizeTree(VectorizableTree[EIdx].get()); return processBuildVector(E, ScalarTy, Builder, *this); } @@ -15329,16 +15491,9 @@ static Instruction *propagateMetadata(Instruction *Inst, ArrayRef VL) { return llvm::propagateMetadata(Inst, Insts); } -Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilderBase::InsertPointGuard Guard(Builder); - if (E->VectorizedValue && - (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI || - E->isAltShuffle())) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); - return E->VectorizedValue; - } - Value *V = E->Scalars.front(); Type *ScalarTy = V->getType(); if (!isa(V)) @@ -15355,7 +15510,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { // Set insert point for non-reduction initial nodes. 
if (E->hasState() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); - Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs); + Value *Vec = createBuildVector(E, ScalarTy); E->VectorizedValue = Vec; return Vec; } @@ -15408,32 +15563,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { switch (ShuffleOrOp) { case Instruction::PHI: { assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() || - E != VectorizableTree.front().get() || - !E->UserTreeIndices.empty()) && + E != VectorizableTree.front().get() || E->UserTreeIndex) && "PHI reordering is free."); - if (PostponedPHIs && E->VectorizedValue) - return E->VectorizedValue; auto *PH = cast(VL0); Builder.SetInsertPoint(PH->getParent(), PH->getParent()->getFirstNonPHIIt()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - if (PostponedPHIs || !E->VectorizedValue) { - PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); - E->PHI = NewPhi; - Value *V = NewPhi; - - // Adjust insertion point once all PHI's have been generated. - Builder.SetInsertPoint(PH->getParent(), - PH->getParent()->getFirstInsertionPt()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + Value *V = NewPhi; - V = FinalShuffle(V, E); + // Adjust insertion point once all PHI's have been generated. + Builder.SetInsertPoint(PH->getParent(), + PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - E->VectorizedValue = V; - if (PostponedPHIs) - return V; - } - PHINode *NewPhi = cast(E->PHI); + V = FinalShuffle(V, E); + + E->VectorizedValue = V; // If phi node is fully emitted - exit. 
if (NewPhi->getNumIncomingValues() != 0) return NewPhi; @@ -15459,7 +15605,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true); + Value *Vec = vectorizeOperand(E, I); if (VecTy != Vec->getType()) { assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() || MinBWs.contains(getOperandEntry(E, I))) && @@ -15477,8 +15623,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); - if (ArrayRef TEs = getTreeEntries(V); !TEs.empty()) - V = TEs.front()->VectorizedValue; setInsertPointAfterBundle(E); V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -15497,7 +15641,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::InsertElement: { assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); Builder.SetInsertPoint(cast(E->Scalars.back())); - Value *V = vectorizeOperand(E, 1, PostponedPHIs); + Value *V = vectorizeOperand(E, 1); ArrayRef Op = E->getOperand(1); Type *ScalarTy = Op.front()->getType(); if (cast(V->getType())->getElementType() != ScalarTy) { @@ -15669,11 +15813,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::BitCast: { setInsertPointAfterBundle(E); - Value *InVec = vectorizeOperand(E, 0, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *InVec = vectorizeOperand(E, 0); auto *CI = cast(VL0); Instruction::CastOps VecOpcode = CI->getOpcode(); @@ -15716,16 +15856,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::ICmp: { setInsertPointAfterBundle(E); - Value *L = vectorizeOperand(E, 0, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond 
merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } - Value *R = vectorizeOperand(E, 1, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *L = vectorizeOperand(E, 0); + Value *R = vectorizeOperand(E, 1); if (L->getType() != R->getType()) { assert((getOperandEntry(E, 0)->isGather() || getOperandEntry(E, 1)->isGather() || @@ -15761,21 +15893,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::Select: { setInsertPointAfterBundle(E); - Value *Cond = vectorizeOperand(E, 0, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } - Value *True = vectorizeOperand(E, 1, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } - Value *False = vectorizeOperand(E, 2, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *Cond = vectorizeOperand(E, 0); + Value *True = vectorizeOperand(E, 1); + Value *False = vectorizeOperand(E, 2); if (True->getType() != VecTy || False->getType() != VecTy) { assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() || getOperandEntry(E, 2)->isGather() || @@ -15814,12 +15934,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::FNeg: { setInsertPointAfterBundle(E); - Value *Op = vectorizeOperand(E, 0, PostponedPHIs); - - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *Op = vectorizeOperand(E, 0); Value *V = Builder.CreateUnOp( static_cast(E->getOpcode()), Op); @@ -15837,12 +15952,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::Freeze: { 
setInsertPointAfterBundle(E); - Value *Op = vectorizeOperand(E, 0, PostponedPHIs); - - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *Op = vectorizeOperand(E, 0); if (Op->getType() != VecTy) { assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() || @@ -15878,16 +15988,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { case Instruction::Xor: { setInsertPointAfterBundle(E); - Value *LHS = vectorizeOperand(E, 0, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } - Value *RHS = vectorizeOperand(E, 1, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *LHS = vectorizeOperand(E, 0); + Value *RHS = vectorizeOperand(E, 1); if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { for (unsigned I : seq(0, E->getNumOperands())) { ArrayRef Ops = E->getOperand(I); @@ -15988,11 +16090,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { NewLI = Inst; } else { assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); - Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *VecPtr = vectorizeOperand(E, 0); if (isa(ScalarTy)) { assert(SLPReVec && "FixedVectorType is not expected."); // CreateMaskedGather expects VecTy and VecPtr have same size. 
We need @@ -16030,7 +16128,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { setInsertPointAfterBundle(E); - Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs); + Value *VecValue = vectorizeOperand(E, 0); if (VecValue->getType() != VecTy) VecValue = Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0)); @@ -16073,19 +16171,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { auto *GEP0 = cast(VL0); setInsertPointAfterBundle(E); - Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *Op0 = vectorizeOperand(E, 0); SmallVector OpVecs; for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { - Value *OpVec = vectorizeOperand(E, J, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *OpVec = vectorizeOperand(E, J); OpVecs.push_back(OpVec); } @@ -16143,11 +16233,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { continue; } - Value *OpVec = vectorizeOperand(E, I, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *OpVec = vectorizeOperand(E, I); ScalarArg = CEI->getArgOperand(I); if (cast(OpVec->getType())->getElementType() != ScalarArg->getType()->getScalarType() && @@ -16191,11 +16277,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *V; if (SLPReVec && !E->isAltShuffle()) { setInsertPointAfterBundle(E); - Value *Src = vectorizeOperand(E, 0, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } + Value *Src = vectorizeOperand(E, 0); SmallVector ThisMask(calculateShufflevectorMask(E->Scalars)); if (auto *SVSrc = dyn_cast(Src)) { 
assert(isa(SVSrc->getOperand(1)) && @@ -16224,19 +16306,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *LHS = nullptr, *RHS = nullptr; if (Instruction::isBinaryOp(E->getOpcode()) || isa(VL0)) { setInsertPointAfterBundle(E); - LHS = vectorizeOperand(E, 0, PostponedPHIs); - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; - } - RHS = vectorizeOperand(E, 1, PostponedPHIs); + LHS = vectorizeOperand(E, 0); + RHS = vectorizeOperand(E, 1); } else { setInsertPointAfterBundle(E); - LHS = vectorizeOperand(E, 0, PostponedPHIs); - } - if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); - return E->VectorizedValue; + LHS = vectorizeOperand(E, 0); } if (LHS && RHS && ((Instruction::isBinaryOp(E->getOpcode()) && @@ -16387,38 +16461,24 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, // gathered loads. for (const std::unique_ptr &TE : VectorizableTree) { if (GatheredLoadsEntriesFirst.has_value() && - TE->Idx >= *GatheredLoadsEntriesFirst && - (!TE->isGather() || !TE->UserTreeIndices.empty())) { - assert((!TE->UserTreeIndices.empty() || + TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue && + (!TE->isGather() || TE->UserTreeIndex)) { + assert((TE->UserTreeIndex || (TE->getOpcode() == Instruction::Load && !TE->isGather())) && "Expected gathered load node."); - (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); + (void)vectorizeTree(TE.get()); } } - // Postpone emission of PHIs operands to avoid cyclic dependencies issues. 
- (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true); - for (const std::unique_ptr &TE : VectorizableTree) - if (TE->State == TreeEntry::Vectorize && - TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() && - TE->VectorizedValue) - (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); + (void)vectorizeTree(VectorizableTree[0].get()); // Run through the list of postponed gathers and emit them, replacing the temp // emitted allocas with actual vector instructions. ArrayRef PostponedNodes = PostponedGathers.getArrayRef(); DenseMap> PostponedValues; for (const TreeEntry *E : PostponedNodes) { auto *TE = const_cast(E); - if (auto *VecTE = getSameValuesTreeEntry( - TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand( - TE->UserTreeIndices.front().EdgeIdx)); - VecTE && VecTE->isSame(TE->Scalars)) - // Found gather node which is absolutely the same as one of the - // vectorized nodes. It may happen after reordering. - continue; auto *PrevVec = cast(TE->VectorizedValue); TE->VectorizedValue = nullptr; - auto *UserI = - cast(TE->UserTreeIndices.front().UserTE->VectorizedValue); + auto *UserI = cast(TE->UserTreeIndex.UserTE->VectorizedValue); // If user is a PHI node, its vector code have to be inserted right before // block terminator. Since the node was delayed, there were some unresolved // dependencies at the moment when stab instruction was emitted. 
In a case @@ -16444,7 +16504,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, Builder.SetInsertPoint(PrevVec); } Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); - Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); + Value *Vec = vectorizeTree(TE); if (auto *VecI = dyn_cast(Vec); VecI && VecI->getParent() == Builder.GetInsertBlock() && Builder.GetInsertPoint()->comesBefore(VecI)) @@ -16490,7 +16550,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, } if (IsSigned.value_or(false)) { // Final attempt - check user node. - auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE); + auto It = MinBWs.find(TE->UserTreeIndex.UserTE); if (It != MinBWs.end()) IsSigned = It->second.second; } @@ -16956,15 +17016,11 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, for (Instruction *I : RemovedInsts) { const TreeEntry *IE = getTreeEntries(I).front(); if (IE->Idx != 0 && - !(VectorizableTree.front()->isGather() && - !IE->UserTreeIndices.empty() && + !(VectorizableTree.front()->isGather() && IE->UserTreeIndex && (ValueToGatherNodes.lookup(I).contains( VectorizableTree.front().get()) || - any_of(IE->UserTreeIndices, - [&](const EdgeInfo &EI) { - return EI.UserTE == VectorizableTree.front().get() && - EI.EdgeIdx == UINT_MAX; - }))) && + (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() && + IE->UserTreeIndex.EdgeIdx == UINT_MAX))) && !(GatheredLoadsEntriesFirst.has_value() && IE->Idx >= *GatheredLoadsEntriesFirst && VectorizableTree.front()->isGather() && @@ -17872,6 +17928,18 @@ bool BoUpSLP::collectValuesToDemote( E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth))); // Demote gathers. 
if (Res && E.isGather()) { + if (E.hasState()) { + if (const TreeEntry *SameTE = + getSameValuesTreeEntry(E.getMainOp(), E.Scalars); + SameTE) + if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth, + ToDemote, Visited, NodesToKeepBWs, + MaxDepthLevel, IsProfitableToDemote, + IsTruncRoot)) { + ToDemote.push_back(E.Idx); + return true; + } + } // Check possible extractelement instructions bases and final vector // length. SmallPtrSet UniqueBases; @@ -17884,13 +17952,15 @@ bool BoUpSLP::collectValuesToDemote( const unsigned VF = E.Scalars.size(); Type *OrigScalarTy = E.Scalars.front()->getType(); if (UniqueBases.size() <= 2 || - ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) == + ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >= ::getNumberOfParts( *TTI, getWidenedType( IntegerType::get(OrigScalarTy->getContext(), BitWidth), - VF))) + VF))) { ToDemote.push_back(E.Idx); + return true; + } } return Res; }; @@ -17966,12 +18036,6 @@ bool BoUpSLP::collectValuesToDemote( (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth))); } else { - // Several vectorized uses? Check if we can truncate it, otherwise - - // exit. - if (E.UserTreeIndices.size() > 1 && - !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1, - std::ref(BitWidth)))) - return false; bool NeedToExit = false; if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit)) return false; @@ -18091,8 +18155,7 @@ bool BoUpSLP::collectValuesToDemote( BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)}); } - // We can demote phis if we can demote all their incoming operands. Note that - // we don't need to worry about cycles since we ensure single use above. + // We can demote phis if we can demote all their incoming operands. 
case Instruction::PHI: { const unsigned NumOps = E.getNumOperands(); SmallVector Ops(NumOps); @@ -18213,13 +18276,9 @@ void BoUpSLP::computeMinimumValueSizes() { NodeIdx = 1; // Ensure the roots of the vectorizable tree don't form a cycle. - if (VectorizableTree[NodeIdx]->isGather() || - (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) || - (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices, - [NodeIdx](const EdgeInfo &EI) { - return EI.UserTE->Idx > NodeIdx; - }))) - return; + assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 || + !VectorizableTree[NodeIdx]->UserTreeIndex) && + "Unexpected tree is graph."); // The first value node for store/insertelement is sext/zext/trunc? Skip it, // resize to the final type. @@ -18248,7 +18307,7 @@ void BoUpSLP::computeMinimumValueSizes() { ToDemote.clear(); // Check if the root is trunc and the next node is gather/buildvector, then // keep trunc in scalars, which is free in most cases. - if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && + if (E.isGather() && IsTruncRoot && E.UserTreeIndex && !NodesToKeepBWs.contains(E.Idx) && E.Idx > (IsStoreOrInsertElt ? 
2u : 1u) && all_of(E.Scalars, [&](Value *V) { @@ -18256,7 +18315,7 @@ void BoUpSLP::computeMinimumValueSizes() { (!V->hasNUsesOrMore(UsesLimit) && none_of(V->users(), [&](User *U) { ArrayRef TEs = getTreeEntries(U); - const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; + const TreeEntry *UserTE = E.UserTreeIndex.UserTE; if (TEs.empty() || is_contained(TEs, UserTE)) return false; if (!isasecond.first; @@ -18492,28 +18551,26 @@ void BoUpSLP::computeMinimumValueSizes() { NodeIdx = NewIdx; IsTruncRoot = NodeIdx < VectorizableTree.size() && - any_of(VectorizableTree[NodeIdx]->UserTreeIndices, - [](const EdgeInfo &EI) { - return EI.EdgeIdx == 0 && - EI.UserTE->getOpcode() == Instruction::Trunc && - !EI.UserTE->isAltShuffle(); - }); + VectorizableTree[NodeIdx]->UserTreeIndex && + VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 && + VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() == + Instruction::Trunc && + !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle(); IsSignedCmp = NodeIdx < VectorizableTree.size() && + VectorizableTree[NodeIdx]->UserTreeIndex && + VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() && + VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() == + Instruction::ICmp && any_of( - VectorizableTree[NodeIdx]->UserTreeIndices, - [&](const EdgeInfo &EI) { - return EI.UserTE && EI.UserTE->hasState() && - EI.UserTE->getOpcode() == Instruction::ICmp && - any_of(EI.UserTE->Scalars, [&](Value *V) { - auto *IC = dyn_cast(V); - return IC && - (IC->isSigned() || - !isKnownNonNegative(IC->getOperand(0), - SimplifyQuery(*DL)) || - !isKnownNonNegative(IC->getOperand(1), - SimplifyQuery(*DL))); - }); + VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars, + [&](Value *V) { + auto *IC = dyn_cast(V); + return IC && (IC->isSigned() || + !isKnownNonNegative(IC->getOperand(0), + SimplifyQuery(*DL)) || + !isKnownNonNegative(IC->getOperand(1), + SimplifyQuery(*DL))); }); } @@ -18709,8 +18766,10 @@ 
SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, Size = R.getCanonicalGraphSize(); return false; } - R.reorderTopToBottom(); - R.reorderBottomToTop(); + if (R.isProfitableToReorder()) { + R.reorderTopToBottom(); + R.reorderBottomToTop(); + } R.transformNodes(); R.buildExternalUses(); @@ -18767,7 +18826,7 @@ static bool checkTreeSizes(ArrayRef> Sizes, return V + (P - Mean) * (P - Mean); }) / Num; - return Dev * 81 / (Mean * Mean) == 0; + return Dev * 96 / (Mean * Mean) == 0; } bool SLPVectorizerPass::vectorizeStores( @@ -19303,10 +19362,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, R.buildTree(Ops); if (R.isTreeTinyAndNotFullyVectorizable()) continue; - R.reorderTopToBottom(); - R.reorderBottomToTop( - /*IgnoreReorder=*/!isa(Ops.front()) && - !R.doesRootHaveInTreeUses()); + if (R.isProfitableToReorder()) { + R.reorderTopToBottom(); + R.reorderBottomToTop(!isa(Ops.front())); + } R.transformNodes(); R.buildExternalUses(); @@ -20243,7 +20302,7 @@ class HorizontalReduction { } V.reorderTopToBottom(); // No need to reorder the root node at all. - V.reorderBottomToTop(!V.doesRootHaveInTreeUses()); + V.reorderBottomToTop(/*IgnoreReorder=*/true); // Keep extracted other reduction values, if they are used in the // vectorization trees. 
BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll index 8dc0181425625..7e9055e1405fa 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -17,7 +17,7 @@ target triple = "aarch64--linux" ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-19' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '8' +; YAML-NEXT: - TreeSize: '10' define i32 @test_select(ptr noalias nocapture readonly %blk1, ptr noalias nocapture readonly %blk2, i32 %lx, i32 %h) { ; CHECK-LABEL: @test_select( @@ -230,7 +230,7 @@ for.end: ; preds = %for.end.loopexit, % ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-44' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '10' +; YAML-NEXT: - TreeSize: '12' define i32 @test_unrolled_select(ptr noalias nocapture readonly %blk1, ptr noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 { ; CHECK-LABEL: @test_unrolled_select( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll index 219496fc1ac9b..dcdfc6efbfb92 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll @@ -7,7 +7,7 @@ define void @f(ptr %r, ptr %w) { %add0 = fadd double %f0, %f0 %add1 = fadd double %f1, %f1 %w1 = getelementptr inbounds double, ptr %w, i64 1 -; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 3 +; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 4 store double %add0, ptr %w, !dbg !9 store double %add1, ptr %w1 ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll 
b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll index f5e904467baa7..b1bd2546c26f4 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll @@ -6,7 +6,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[V1:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: br label [[FOR_COND15_PREHEADER:%.*]] ; CHECK: for.cond15.preheader: ; CHECK-NEXT: br label [[IF_END:%.*]] @@ -15,7 +15,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) { ; CHECK: if.end: ; CHECK-NEXT: br label [[FOR_COND15:%.*]] ; CHECK: for.end39: -; CHECK-NEXT: switch i32 %arg2, label [[DO_BODY:%.*]] [ +; CHECK-NEXT: switch i32 [[ARG2:%.*]], label [[DO_BODY:%.*]] [ ; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 1, label [[SW_BB195:%.*]] ; CHECK-NEXT: ] @@ -26,8 +26,7 @@ define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) { ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]]) ; CHECK-NEXT: br label [[SW_EPILOG:%.*]] ; CHECK: sw.bb195: ; CHECK-NEXT: br label [[SW_EPILOG]] @@ -39,7 +38,7 @@ 
define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) { ; CHECK: if.end.1: ; CHECK-NEXT: br label [[FOR_COND15_1:%.*]] ; CHECK: for.cond15.1: -; CHECK-NEXT: br i1 %arg, label [[FOR_END39:%.*]], label [[FOR_COND15_PREHEADER]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_END39:%.*]], label [[FOR_COND15_PREHEADER]] ; entry: %conv = sitofp i32 undef to double diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll index 4b6f0438b8915..6e9d9acbe83b0 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -311,15 +311,15 @@ define void @noop_extracts_9_lanes(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> ; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> ; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: 
[[TMP5:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]] ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> @@ -401,10 +401,10 @@ define void @first_mul_chain_jumbled(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1 ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> ; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> @@ -491,16 +491,16 @@ define void @first_and_second_mul_chain_jumbled(ptr %ptr.1, ptr %ptr.2) { ; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x 
double> [[V_2]], i32 2 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> ; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> ; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_2]] ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll index 644d645b9dc88..ff182ae3f56de 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll @@ -10,7 +10,7 @@ ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '6' +; YAML-NEXT: - TreeSize: '7' define i32 @min_double(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-LABEL: @min_double( ; CHECK-NEXT: entry: @@ -44,7 +44,7 @@ entry: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-2' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '6' +; YAML-NEXT: - TreeSize: '7' define i32 
@min_float(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-LABEL: @min_float( ; CHECK-NEXT: entry: @@ -78,7 +78,7 @@ entry: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '6' +; YAML-NEXT: - TreeSize: '7' define i32 @max_double(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-LABEL: @max_double( ; CHECK-NEXT: entry: @@ -112,7 +112,7 @@ entry: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-2' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '6' +; YAML-NEXT: - TreeSize: '7' define i32 @max_float(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-LABEL: @max_float( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/small-phi-tree.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/small-phi-tree.ll index c4f35d8dfc219..24a8e0e4fd018 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/small-phi-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/small-phi-tree.ll @@ -4,20 +4,19 @@ define float @test(ptr %call78) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> , ptr [[CALL78:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> , ptr [[CALL78:%.*]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY194:%.*]] ; CHECK: for.body194: ; CHECK-NEXT: [[INDVARS_IV132:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[FOR_BODY194]] ] ; CHECK-NEXT: [[CURRENTW_031:%.*]] = phi ptr [ [[CALL78]], [[ENTRY]] ], [ [[PREVIOUSW_030:%.*]], [[FOR_BODY194]] ] ; CHECK-NEXT: [[PREVIOUSW_030]] = phi ptr [ null, [[ENTRY]] ], [ [[CURRENTW_031]], [[FOR_BODY194]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP0]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY194]] ] +; CHECK-NEXT: [[TMP3]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> ; CHECK-NEXT: store float 0.000000e+00, ptr [[CURRENTW_031]], align 4 ; 
CHECK-NEXT: tail call void null(ptr [[PREVIOUSW_030]], ptr null, ptr null, i32 0, i32 0, ptr null, ptr null, i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> poison, ptr [[CURRENTW_031]], i32 0 -; CHECK-NEXT: [[TMP3]] = insertelement <2 x ptr> [[TMP2]], ptr [[PREVIOUSW_030]], i32 1 ; CHECK-NEXT: br i1 false, label [[FOR_END286_LOOPEXIT:%.*]], label [[FOR_BODY194]] ; CHECK: for.end286.loopexit: -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x ptr> [ [[TMP1]], [[FOR_BODY194]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x ptr> [ [[TMP3]], [[FOR_BODY194]] ] ; CHECK-NEXT: ret float 0.000000e+00 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll index b80be40d9fc86..fc62f4d511041 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll @@ -31,6 +31,6 @@ define void @fun(ptr nocapture, i32 zeroext) local_unnamed_addr #0 { ._crit_edge: ; preds = %.lr.ph ret void -; CHECK: SLP: Adding cost -1 for bundle Idx: 3, n=2 [ %4 = icmp ult i32 %2, %1, ..] +; CHECK: SLP: Adding cost -1 for bundle Idx: 4, n=2 [ %4 = icmp ult i32 %2, %1, ..] 
} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll index 9fa88084aaa0a..a05d4fdd6315b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll @@ -5,14 +5,12 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i2> @llvm.smin.v2i2(<2 x i2> zeroinitializer, <2 x i2> zeroinitializer) -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i2> zeroinitializer, <2 x i2> [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i2> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i2> [[TMP2]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = zext i2 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> zeroinitializer, <2 x i32> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[ADD:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[ADD]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i2> [[TMP2]], i32 0 -; CHECK-NEXT: [[ADD45:%.*]] = zext i2 [[TMP5]] to i32 +; CHECK-NEXT: [[ADD45:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[ADD152:%.*]] = or i32 [[ADD45]], [[ADD]] ; CHECK-NEXT: [[IDXPROM153:%.*]] = sext i32 [[ADD152]] to i64 ; CHECK-NEXT: [[ARRAYIDX154:%.*]] = getelementptr i8, ptr null, i64 [[IDXPROM153]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll index bf3f0c4df74e4..5562291dbb6be 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll 
+++ b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll @@ -31,7 +31,7 @@ define void @test() { ; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[I2]], 0.000000e+00 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP9]] = insertelement <2 x float> [[TMP8]], float [[I2]], i32 0 -; CHECK-NEXT: [[TMP10]] = insertelement <2 x float> [[TMP2]], float [[I2]], i32 0 +; CHECK-NEXT: [[TMP10]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP2]], <2 x i32> ; CHECK-NEXT: br i1 [[TOBOOL]], label [[BB1]], label [[BB2]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll index 469f165d302a9..20d7ba99fd515 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll @@ -8,10 +8,11 @@ define i32 @test(ptr %c, i16 %a, i16 %0) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP16]] to <4 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i16> [[TMP7]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i16 [[A]], -2 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll index d884a1af8aab7..0e08ef4d74308 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll @@ -5,7 +5,6 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr null, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[TMP12]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> @@ -14,6 +13,7 @@ define i32 @test() { ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP4]], <4 x i64> [[TMP0]], i64 0) ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = add <8 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll index 7576eb7a8f55e..878b2370bfd2a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll @@ -7,29 +7,29 @@ define i32 @test(i64 %l.549) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[CONV3:%.*]] = sext i32 0 to i64 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> 
poison, i64 [[CONV3]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i32 1 ; CHECK-NEXT: br label %[[IF_THEN19:.*]] ; CHECK: [[P:.*]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: br i1 false, label %[[S:.*]], label %[[Q:.*]] ; CHECK: [[Q]]: ; CHECK-NEXT: [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP6]], i64 0) ; CHECK-NEXT: br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]] ; CHECK: [[LOR_LHS_FALSE]]: -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]] ; CHECK: [[R]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP5]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i64> [ [[TMP7]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ] ; CHECK-NEXT: br i1 false, label %[[S]], label 
%[[LAND_LHS_TRUE]] ; CHECK: [[LAND_LHS_TRUE]]: ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ [[TMP8]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ] ; CHECK-NEXT: br i1 false, label %[[Q]], label %[[S]] ; CHECK: [[S]]: -; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP6]], %[[LOR_LHS_FALSE]] ], [ [[TMP2]], %[[P]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP7]], %[[LOR_LHS_FALSE]] ], [ [[TMP17]], %[[P]] ] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> ; CHECK-NEXT: br label %[[IF_THEN19]] ; CHECK: [[IF_THEN19]]: @@ -37,7 +37,7 @@ define i32 @test(i64 %l.549) { ; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[L_549]], i32 1 -; CHECK-NEXT: [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> zeroinitializer, i64 2) +; CHECK-NEXT: [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> [[TMP2]], i64 2) ; CHECK-NEXT: br i1 false, label %[[R]], label %[[IF_END25]] ; CHECK: [[IF_END25]]: ; CHECK-NEXT: br i1 false, label %[[IF_END29]], label %[[P]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index a7a92bad5e5c1..19c29be1ef384 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -11,21 +11,19 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], 
<2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP9]], <2 x float> [[TMP8]], i64 0) ; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4 ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: entry.if.end72_crit_edge: ; CHECK-NEXT: br label [[IF_END72:%.*]] @@ -48,7 
+46,8 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float> -; CHECK-NEXT: store <4 x float> [[TMP26]], ptr [[RC21]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll index fa33621de5ae7..b39480b12496b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll @@ -33,12 +33,12 @@ define i64 @foo() { ; FORCED: bb3: ; FORCED-NEXT: [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ] ; FORCED-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ] +; FORCED-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> , <2 x i32> ; FORCED-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> , i64 [[PHI5]], i32 0 +; FORCED-NEXT: [[TMP7]] = add <2 x i64> [[TMP6]], [[TMP2]] ; FORCED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] ; FORCED-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; FORCED-NEXT: [[TMP5]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> -; FORCED-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> , <2 x i32> -; FORCED-NEXT: [[TMP7]] = add <2 x i64> [[TMP6]], [[TMP2]] ; FORCED-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[TMP8]] ; FORCED-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 ; FORCED-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll index 63b41627106e5..53bffe502f3da 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -13,7 +13,7 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK: while.body: ; CHECK-NEXT: [[A_020:%.*]] = phi ptr [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 @@ -24,7 +24,7 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2 @@ -36,7 +36,7 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> splat (i64 2) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 1 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll index d649465c9ff12..7960278d2b21d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll @@ -8,9 +8,9 @@ ; YAML: Function: test ; YAML: Args: ; YAML: - String: 'Stores SLP vectorized with cost ' -; YAML: - Cost: '-3' +; YAML: - Cost: '-6' ; YAML: - String: ' and with tree size ' -; YAML: - TreeSize: '14' +; YAML: - TreeSize: '16' ; YAML: ... ; Test that SLP cost modeling is able to match gathering tree diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll index 58ea4f8da01a4..60c067e6555ef 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll @@ -18,22 +18,17 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) { ; CHECK-NEXT: [[TMP8:%.*]] = and <2 x i24> [[TMP7]], splat (i24 255) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], splat (i24 24) ; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> splat (i24 23), <2 x i24> [[TMP8]] -; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8> -; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i24> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = and <2 x i32> [[TMP26]], splat (i32 254) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i32> [[TMP13]], splat (i32 4) -; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP15]], <2 x i8> splat (i8 2), <2 x i8> [[TMP23]] -; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> splat (i32 2), <2 x i32> [[TMP26]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq 
<2 x i32> [[TMP14]], splat (i32 32) -; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP17]], <2 x i8> splat (i8 31), <2 x i8> [[TMP25]] -; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP17]], <2 x i32> splat (i32 31), <2 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq <2 x i32> [[TMP16]], splat (i32 54) -; CHECK-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP27]], <2 x i8> splat (i8 53), <2 x i8> [[TMP18]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i8> [[TMP21]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = zext i8 [[TMP22]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> splat (i32 53), <2 x i32> [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0 ; CHECK-NEXT: store i32 [[TMP19]], ptr [[P1]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i8> [[TMP21]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = zext i8 [[TMP24]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP18]], i32 1 ; CHECK-NEXT: [[CMP210_NOT:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]] ; CHECK-NEXT: ret i1 [[CMP210_NOT]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll index d3d7f21ee1003..55f2b238c07df 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbw-node-used-twice.ll @@ -8,9 +8,9 @@ define i8 @test() { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> , i16 [[SUB_I_I79_PEEL_I]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i16> 
+; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i16> [[TMP3]], [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i16> [[TMP4]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: [[CONV13_I89_PEEL_I:%.*]] = zext i1 [[TMP5]] to i8 ; CHECK-NEXT: ret i8 [[CONV13_I89_PEEL_I]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/perfect-matched-reused-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/perfect-matched-reused-bv.ll index 1053e0fc10669..4c9867a6fab75 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/perfect-matched-reused-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/perfect-matched-reused-bv.ll @@ -7,16 +7,16 @@ define void @test() { ; CHECK-NEXT: [[BB:.*]]: ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB4:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP3:%.*]], %[[BB4:.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: br i1 false, label %[[BB7:.*]], label %[[BB4]] ; CHECK: [[BB4]]: ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP4]] = add <2 x i32> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[TMP3]] = add <2 x i32> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]] ; CHECK-NEXT: br i1 false, label %[[BB7]], label %[[BB1]] ; CHECK: [[BB7]]: -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP1]], %[[BB1]] ], [ [[TMP3]], %[[BB4]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP1]], %[[BB1]] ], [ [[TMP4]], %[[BB4]] ] ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll index 22e7e6a8e6624..6e770bdf6eb0c 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll @@ -4,10 +4,10 @@ define void @test(float %0) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> , float [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv <2 x float> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x float> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fdiv <2 x float> [[TMP6]], zeroinitializer ; CHECK-NEXT: br label %[[BB6:.*]] ; CHECK: [[BB6]]: ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-as-operand-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-as-operand-reorder.ll index 51ce970bf06bc..e91a068a8d240 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-as-operand-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-as-operand-reorder.ll @@ -6,15 +6,15 @@ define void @test() { ; CHECK-NEXT: [[BB:.*]]: ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP3:%.*]], %[[BB3:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP2:%.*]], %[[BB3:.*]] ] ; CHECK-NEXT: br i1 false, label %[[BB6:.*]], label %[[BB3]] ; CHECK: [[BB3]]: ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3]] = add <2 x i32> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP2]] = add <2 x i32> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP1]] ; CHECK-NEXT: br i1 
false, label %[[BB6]], label %[[BB1]] ; CHECK: [[BB6]]: -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ [[TMP0]], %[[BB1]] ], [ [[TMP2]], %[[BB3]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ [[TMP0]], %[[BB1]] ], [ [[TMP3]], %[[BB3]] ] ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll index ea6d96147c951..efd44f5a85664 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-mask-with-poison-index.ll @@ -14,9 +14,9 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) { ; CHECK-NEXT: [[TMP12:%.*]] = phi float [ [[TMP7]], [[TMP5]] ], [ [[TMP61:%.*]], %[[TMP56]] ] ; CHECK-NEXT: [[TMP13:%.*]] = phi float [ [[TMP6]], [[TMP5]] ], [ [[TMP62:%.*]], %[[TMP56]] ] ; CHECK-NEXT: [[TMP14:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP63:%.*]], %[[TMP56]] ] -; CHECK-NEXT: [[TMP15:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP64:%.*]], %[[TMP56]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP64:%.*]], %[[TMP56]] ] ; CHECK-NEXT: [[TMP16:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP65:%.*]], %[[TMP56]] ] -; CHECK-NEXT: [[TMP17:%.*]] = phi float [ undef, [[TMP5]] ], [ [[TMP66:%.*]], %[[TMP56]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi float [ undef, [[TMP5]] ], [ [[TMP66:%.*]], %[[TMP56]] ] ; CHECK-NEXT: [[TMP18:%.*]] = phi float [ 0.000000e+00, [[TMP5]] ], [ [[TMP67:%.*]], %[[TMP56]] ] ; CHECK-NEXT: [[TMP19:%.*]] = phi float [ [[TMP4]], [[TMP5]] ], [ [[TMP68:%.*]], %[[TMP56]] ] ; CHECK-NEXT: [[TMP20:%.*]] = phi float [ [[TMP4]], [[TMP5]] ], [ [[TMP69:%.*]], %[[TMP56]] ] @@ -37,15 +37,15 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) { ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = shufflevector 
<4 x float> [[TMP32]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP33]], <4 x float> zeroinitializer, <4 x float> zeroinitializer) -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP36:%.*]] = fsub float [[TMP17]], [[TMP35]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP25]], i32 0 ; CHECK-NEXT: [[TMP38:%.*]] = fsub float [[TMP15]], [[TMP37]] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x float> [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP49:%.*]] = fsub float [[TMP17]], [[TMP48]] ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP14]], i64 0 ; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <4 x float> [[TMP25]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <2 x float> [[TMP39]], <2 x float> [[TMP40]], <2 x i32> -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP38]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP36]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP49]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP38]], i64 0 ; CHECK-NEXT: [[TMP44:%.*]] = fmul <2 x float> [[TMP42]], [[TMP43]] ; CHECK-NEXT: [[TMP45:%.*]] = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP41]], <2 x float> [[TMP26]], <2 x float> [[TMP44]]) ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP45]], i64 0 @@ -72,9 +72,9 @@ define fastcc i32 @test(ptr %0, <2 x float> %1, i1 %2, float %3, float %4) { ; CHECK-NEXT: [[TMP61]] = phi float [ [[TMP12]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ] ; CHECK-NEXT: [[TMP62]] = phi float [ [[TMP13]], %[[BB29]] 
], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP54]], %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ] ; CHECK-NEXT: [[TMP63]] = phi float [ [[TMP14]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP9]], %[[BB50]] ] -; CHECK-NEXT: [[TMP64]] = phi float [ [[TMP15]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP55]], %[[BB53]] ], [ [[TMP10]], %[[BB50]] ] +; CHECK-NEXT: [[TMP64]] = phi float [ [[TMP17]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP55]], %[[BB53]] ], [ [[TMP10]], %[[BB50]] ] ; CHECK-NEXT: [[TMP65]] = phi float [ [[TMP16]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP11]], %[[BB50]] ] -; CHECK-NEXT: [[TMP66]] = phi float [ [[TMP17]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ] +; CHECK-NEXT: [[TMP66]] = phi float [ [[TMP15]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ] ; CHECK-NEXT: [[TMP67]] = phi float [ [[TMP18]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ] ; CHECK-NEXT: [[TMP68]] = phi float [ [[TMP19]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ 0.000000e+00, %[[BB53]] ], [ [[TMP3]], %[[BB50]] ] ; CHECK-NEXT: [[TMP69]] = phi float [ [[TMP20]], %[[BB29]] ], [ 0.000000e+00, %[[BB27]] ], [ 0.000000e+00, %[[BB8]] ], [ [[TMP54]], %[[BB53]] ], [ 0.000000e+00, %[[BB50]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll index 242d66fda569a..648f051db4a52 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ 
-7,9 +7,8 @@ define void @wombat(ptr %ptr, ptr %ptr1) { ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> undef, <4 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll index de4358c47cfd0..eae0ed466b0c7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll @@ -9,9 +9,9 @@ define void @test() { ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i1> [ [[TMP6:%.*]], [[TMP1:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[TMP1]] ; CHECK: 1: -; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i1> [[TMP0]] to <2 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i8> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i1> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i8> [[TMP4]], zeroinitializer ; CHECK-NEXT: 
[[TMP6]] = and <2 x i1> [[TMP5]], zeroinitializer ; CHECK-NEXT: br label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll index 83e1bef8fa066..d07353798edc9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll @@ -11,11 +11,11 @@ define i1 @test(i64 %v1, ptr %v2, i32 %v3, i1 %v4) { ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i8> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP30]], <2 x i32> [[TMP5]], i64 0) ; CHECK-NEXT: [[TMP11:%.*]] = uitofp <4 x i32> [[TMP10]] to <4 x float> ; CHECK-NEXT: [[TMP12:%.*]] = fdiv <4 x float> zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> poison, i1 [[V4]], i32 0